From be68d63a139bd4a0eae44d06234eaff8c07d207c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 12 Jun 2024 19:19:36 -0400
Subject: ring-buffer: Add ring_buffer_alloc_range()

In preparation to allowing the trace ring buffer to be allocated in a
range of memory that is persistent across reboots, add
ring_buffer_alloc_range(). It takes a contiguous range of memory and will
split it up evenly for the per CPU ring buffers.

If there's not enough memory to handle all CPUs with the minimum size, it
will fail to allocate the ring buffer.

Link: https://lkml.kernel.org/r/20240612232025.363998725@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vineeth Pillai <vineeth@bitbyteword.org>
Cc: Youssef Esmat <youssefesmat@google.com>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Ross Zwisler <zwisler@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 96d2140b471e..a50b0223b1d3 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 struct trace_buffer *
 __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
 
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+					       int order, unsigned long start,
+					       unsigned long range_size,
+					       struct lock_class_key *key);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
@@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 	__ring_buffer_alloc((size), (flags), &__key);	\
 })
 
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc_range(size, flags, order, start, range_size)	\
+({									\
+	static struct lock_class_key __key;				\
+	__ring_buffer_alloc_range((size), (flags), (order), (start),	\
+				  (range_size), &__key);		\
+})
+
 typedef bool (*ring_buffer_cond_fn)(void *data);
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
 		     ring_buffer_cond_fn cond, void *data);
-- 
cgit v1.2.3


From 7a1d1e4b9639ff08b2f42605c2950ae1ba4a43bf Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 12 Jun 2024 19:19:44 -0400
Subject: tracing/ring-buffer: Add last_boot_info file to boot instance

If an instance is mapped to memory on boot up, create a new file called
"last_boot_info" that will hold information that can be used to properly
parse the raw data in the ring buffer.

It will export the delta of the addresses for text and data from what it
was from the last boot. It does not expose actually addresses (unless you
knew what the actual address was from the last boot).

The output will look like:

 # cat last_boot_info
 text delta:	-268435456
 data delta:	-268435456

The text and data are kept separate in case they are ever made different.

Link: https://lkml.kernel.org/r/20240612232026.658680738@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vineeth Pillai <vineeth@bitbyteword.org>
Cc: Youssef Esmat <youssefesmat@google.com>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Ross Zwisler <zwisler@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index a50b0223b1d3..55de3798a9b9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -94,6 +94,9 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag
 					       unsigned long range_size,
 					       struct lock_class_key *key);
 
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+				 long *data);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
-- 
cgit v1.2.3


From 304b3f2bc07ba7c74a1ebc7917a8f0549e6b8d54 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:16 -1000
Subject: sched: Allow sched_cgroup_fork() to fail and introduce
 sched_cancel_fork()

A new BPF extensible sched_class will need more control over the forking
process. It wants to be able to fail from sched_cgroup_fork() after the new
task's sched_task_group is initialized so that the loaded BPF program can
prepare the task with its cgroup association is established and reject fork
if e.g. allocation fails.

Allow sched_cgroup_fork() to fail by making it return int instead of void
and adding sched_cancel_fork() to undo sched_fork() in the error path.

sched_cgroup_fork() doesn't fail yet and this patch shouldn't cause any
behavior changes.

v2: Patch description updated to detail the expected use.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/task.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index d362aacf9f89..4df2f9055587 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
-extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+extern void sched_cancel_fork(struct task_struct *p);
 extern void sched_post_fork(struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
 
-- 
cgit v1.2.3


From 4f9c7ca851044273df5d67e00ca0b4d0476a48f6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:16 -1000
Subject: sched: Factor out cgroup weight conversion functions

Factor out sched_weight_from/to_cgroup() which convert between scheduler
shares and cgroup weight. No functional change. The factored out functions
will be used by a new BPF extensible sched_class so that the weights can be
exposed to the BPF programs in a way which is consistent cgroup weights and
easier to interpret.

The weight conversions will be used regardless of cgroup usage. It's just
borrowing the cgroup weight range as it's more intuitive.
CGROUP_WEIGHT_MIN/DFL/MAX constants are moved outside CONFIG_CGROUPS so that
the conversion helpers can always be defined.

v2: The helpers are now defined regardless of COFNIG_CGROUPS.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/cgroup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2150ca60394b..3cdaec701600 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -29,8 +29,6 @@
 
 struct kernel_clone_args;
 
-#ifdef CONFIG_CGROUPS
-
 /*
  * All weight knobs on the default hierarchy should use the following min,
  * default and max values.  The default value is the logarithmic center of
@@ -40,6 +38,8 @@ struct kernel_clone_args;
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
+#ifdef CONFIG_CGROUPS
+
 enum {
 	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
 	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
-- 
cgit v1.2.3


From a7a9fc549293168c1edbc8433d5d4fbbe1171630 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:17 -1000
Subject: sched_ext: Add boilerplate for extensible scheduler class

This adds dummy implementations of sched_ext interfaces which interact with
the scheduler core and hook them in the correct places. As they're all
dummies, this doesn't cause any behavior changes. This is split out to help
reviewing.

v2: balance_scx_on_up() dropped. This will be handled in sched_ext proper.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/ext.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 include/linux/sched/ext.h

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
new file mode 100644
index 000000000000..a05dfcf533b0
--- /dev/null
+++ b/include/linux/sched/ext.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_EXT_H
+#define _LINUX_SCHED_EXT_H
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+#error "NOT IMPLEMENTED YET"
+#else	/* !CONFIG_SCHED_CLASS_EXT */
+
+static inline void sched_ext_free(struct task_struct *p) {}
+
+#endif	/* CONFIG_SCHED_CLASS_EXT */
+#endif	/* _LINUX_SCHED_EXT_H */
-- 
cgit v1.2.3


From f0e1a0643a59bf1f922fa209cec86a170b784f3f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:17 -1000
Subject: sched_ext: Implement BPF extensible scheduler class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement a new scheduler class sched_ext (SCX), which allows scheduling
policies to be implemented as BPF programs to achieve the following:

1. Ease of experimentation and exploration: Enabling rapid iteration of new
   scheduling policies.

2. Customization: Building application-specific schedulers which implement
   policies that are not applicable to general-purpose schedulers.

3. Rapid scheduler deployments: Non-disruptive swap outs of scheduling
   policies in production environments.

sched_ext leverages BPF’s struct_ops feature to define a structure which
exports function callbacks and flags to BPF programs that wish to implement
scheduling policies. The struct_ops structure exported by sched_ext is
struct sched_ext_ops, and is conceptually similar to struct sched_class. The
role of sched_ext is to map the complex sched_class callbacks to the more
simple and ergonomic struct sched_ext_ops callbacks.

For more detailed discussion on the motivations and overview, please refer
to the cover letter.

Later patches will also add several example schedulers and documentation.

This patch implements the minimum core framework to enable implementation of
BPF schedulers. Subsequent patches will gradually add functionalities
including safety guarantee mechanisms, nohz and cgroup support.

include/linux/sched/ext.h defines struct sched_ext_ops. With the comment on
top, each operation should be self-explanatory. The followings are worth
noting:

- Both "sched_ext" and its shorthand "scx" are used. If the identifier
  already has "sched" in it, "ext" is used; otherwise, "scx".

- In sched_ext_ops, only .name is mandatory. Every operation is optional and
  if omitted a simple but functional default behavior is provided.

- A new policy constant SCHED_EXT is added and a task can select sched_ext
  by invoking sched_setscheduler(2) with the new policy constant. However,
  if the BPF scheduler is not loaded, SCHED_EXT is the same as SCHED_NORMAL
  and the task is scheduled by CFS. When the BPF scheduler is loaded, all
  tasks which have the SCHED_EXT policy are switched to sched_ext.

- To bridge the workflow imbalance between the scheduler core and
  sched_ext_ops callbacks, sched_ext uses simple FIFOs called dispatch
  queues (dsq's). By default, there is one global dsq (SCX_DSQ_GLOBAL), and
  one local per-CPU dsq (SCX_DSQ_LOCAL). SCX_DSQ_GLOBAL is provided for
  convenience and need not be used by a scheduler that doesn't require it.
  SCX_DSQ_LOCAL is the per-CPU FIFO that sched_ext pulls from when putting
  the next task on the CPU. The BPF scheduler can manage an arbitrary number
  of dsq's using scx_bpf_create_dsq() and scx_bpf_destroy_dsq().

- sched_ext guarantees system integrity no matter what the BPF scheduler
  does. To enable this, each task's ownership is tracked through
  p->scx.ops_state and all tasks are put on scx_tasks list. The disable path
  can always recover and revert all tasks back to CFS. See p->scx.ops_state
  and scx_tasks.

- A task is not tied to its rq while enqueued. This decouples CPU selection
  from queueing and allows sharing a scheduling queue across an arbitrary
  subset of CPUs. This adds some complexities as a task may need to be
  bounced between rq's right before it starts executing. See
  dispatch_to_local_dsq() and move_task_to_local_dsq().

- One complication that arises from the above weak association between task
  and rq is that synchronizing with dequeue() gets complicated as dequeue()
  may happen anytime while the task is enqueued and the dispatch path might
  need to release the rq lock to transfer the task. Solving this requires a
  bit of complexity. See the logic around p->scx.sticky_cpu and
  p->scx.ops_qseq.

- Both enable and disable paths are a bit complicated. The enable path
  switches all tasks without blocking to avoid issues which can arise from
  partially switched states (e.g. the switching task itself being starved).
  The disable path can't trust the BPF scheduler at all, so it also has to
  guarantee forward progress without blocking. See scx_ops_enable() and
  scx_ops_disable_workfn().

- When sched_ext is disabled, static_branches are used to shut down the
  entry points from hot paths.

v7: - scx_ops_bypass() was incorrectly and unnecessarily trying to grab
      scx_ops_enable_mutex which can lead to deadlocks in the disable path.
      Fixed.

    - Fixed TASK_DEAD handling bug in scx_ops_enable() path which could lead
      to use-after-free.

    - Consolidated per-cpu variable usages and other cleanups.

v6: - SCX_NR_ONLINE_OPS replaced with SCX_OPI_*_BEGIN/END so that multiple
      groups can be expressed. Later CPU hotplug operations are put into
      their own group.

    - SCX_OPS_DISABLING state is replaced with the new bypass mechanism
      which allows temporarily putting the system into simple FIFO
      scheduling mode bypassing the BPF scheduler. In addition to the shut
      down path, this will also be used to isolate the BPF scheduler across
      PM events. Enabling and disabling the bypass mode requires iterating
      all runnable tasks. rq->scx.runnable_list addition is moved from the
      later watchdog patch.

    - ops.prep_enable() is replaced with ops.init_task() and
      ops.enable/disable() are now called whenever the task enters and
      leaves sched_ext instead of when the task becomes schedulable on
      sched_ext and stops being so. A new operation - ops.exit_task() - is
      called when the task stops being schedulable on sched_ext.

    - scx_bpf_dispatch() can now be called from ops.select_cpu() too. This
      removes the need for communicating local dispatch decision made by
      ops.select_cpu() to ops.enqueue() via per-task storage.
      SCX_KF_SELECT_CPU is added to support the change.

    - SCX_TASK_ENQ_LOCAL which told the BPF scheudler that
      scx_select_cpu_dfl() wants the task to be dispatched to the local DSQ
      was removed. Instead, scx_bpf_select_cpu_dfl() now dispatches directly
      if it finds a suitable idle CPU. If such behavior is not desired,
      users can use scx_bpf_select_cpu_dfl() which returns the verdict in a
      bool out param.

    - scx_select_cpu_dfl() was mishandling WAKE_SYNC and could end up
      queueing many tasks on a local DSQ which makes tasks to execute in
      order while other CPUs stay idle which made some hackbench numbers
      really bad. Fixed.

    - The current state of sched_ext can now be monitored through files
      under /sys/sched_ext instead of /sys/kernel/debug/sched/ext. This is
      to enable monitoring on kernels which don't enable debugfs.

    - sched_ext wasn't telling BPF that ops.dispatch()'s @prev argument may
      be NULL and a BPF scheduler which derefs the pointer without checking
      could crash the kernel. Tell BPF. This is currently a bit ugly. A
      better way to annotate this is expected in the future.

    - scx_exit_info updated to carry pointers to message buffers instead of
      embedding them directly. This decouples buffer sizes from API so that
      they can be changed without breaking compatibility.

    - exit_code added to scx_exit_info. This is used to indicate different
      exit conditions on non-error exits and will be used to handle e.g. CPU
      hotplugs.

    - The patch "sched_ext: Allow BPF schedulers to switch all eligible
      tasks into sched_ext" is folded in and the interface is changed so
      that partial switching is indicated with a new ops flag
      %SCX_OPS_SWITCH_PARTIAL. This makes scx_bpf_switch_all() unnecessasry
      and in turn SCX_KF_INIT. ops.init() is now called with
      SCX_KF_SLEEPABLE.

    - Code reorganized so that only the parts necessary to integrate with
      the rest of the kernel are in the header files.

    - Changes to reflect the BPF and other kernel changes including the
      addition of bpf_sched_ext_ops.cfi_stubs.

v5: - To accommodate 32bit configs, p->scx.ops_state is now atomic_long_t
      instead of atomic64_t and scx_dsp_buf_ent.qseq which uses
      load_acquire/store_release is now unsigned long instead of u64.

    - Fix the bug where bpf_scx_btf_struct_access() was allowing write
      access to arbitrary fields.

    - Distinguish kfuncs which can be called from any sched_ext ops and from
      anywhere. e.g. scx_bpf_pick_idle_cpu() can now be called only from
      sched_ext ops.

    - Rename "type" to "kind" in scx_exit_info to make it easier to use on
      languages in which "type" is a reserved keyword.

    - Since cff9b2332ab7 ("kernel/sched: Modify initial boot task idle
      setup"), PF_IDLE is not set on idle tasks which haven't been online
      yet which made scx_task_iter_next_filtered() include those idle tasks
      in iterations leading to oopses. Update scx_task_iter_next_filtered()
      to directly test p->sched_class against idle_sched_class instead of
      using is_idle_task() which tests PF_IDLE.

    - Other updates to match upstream changes such as adding const to
      set_cpumask() param and renaming check_preempt_curr() to
      wakeup_preempt().

v4: - SCHED_CHANGE_BLOCK replaced with the previous
      sched_deq_and_put_task()/sched_enq_and_set_tsak() pair. This is
      because upstream is adaopting a different generic cleanup mechanism.
      Once that lands, the code will be adapted accordingly.

    - task_on_scx() used to test whether a task should be switched into SCX,
      which is confusing. Renamed to task_should_scx(). task_on_scx() now
      tests whether a task is currently on SCX.

    - scx_has_idle_cpus is barely used anymore and replaced with direct
      check on the idle cpumask.

    - SCX_PICK_IDLE_CORE added and scx_pick_idle_cpu() improved to prefer
      fully idle cores.

    - ops.enable() now sees up-to-date p->scx.weight value.

    - ttwu_queue path is disabled for tasks on SCX to avoid confusing BPF
      schedulers expecting ->select_cpu() call.

    - Use cpu_smt_mask() instead of topology_sibling_cpumask() like the rest
      of the scheduler.

v3: - ops.set_weight() added to allow BPF schedulers to track weight changes
      without polling p->scx.weight.

    - move_task_to_local_dsq() was losing SCX-specific enq_flags when
      enqueueing the task on the target dsq because it goes through
      activate_task() which loses the upper 32bit of the flags. Carry the
      flags through rq->scx.extra_enq_flags.

    - scx_bpf_dispatch(), scx_bpf_pick_idle_cpu(), scx_bpf_task_running()
      and scx_bpf_task_cpu() now use the new KF_RCU instead of
      KF_TRUSTED_ARGS to make it easier for BPF schedulers to call them.

    - The kfunc helper access control mechanism implemented through
      sched_ext_entity.kf_mask is improved. Now SCX_CALL_OP*() is always
      used when invoking scx_ops operations.

v2: - balance_scx_on_up() is dropped. Instead, on UP, balance_scx() is
      called from put_prev_taks_scx() and pick_next_task_scx() as necessary.
      To determine whether balance_scx() should be called from
      put_prev_task_scx(), SCX_TASK_DEQD_FOR_SLEEP flag is added. See the
      comment in put_prev_task_scx() for details.

    - sched_deq_and_put_task() / sched_enq_and_set_task() sequences replaced
      with SCHED_CHANGE_BLOCK().

    - Unused all_dsqs list removed. This was a left-over from previous
      iterations.

    - p->scx.kf_mask is added to track and enforce which kfunc helpers are
      allowed. Also, init/exit sequences are updated to make some kfuncs
      always safe to call regardless of the current BPF scheduler state.
      Combined, this should make all the kfuncs safe.

    - BPF now supports sleepable struct_ops operations. Hacky workaround
      removed and operations and kfunc helpers are tagged appropriately.

    - BPF now supports bitmask / cpumask helpers. scx_bpf_get_idle_cpumask()
      and friends are added so that BPF schedulers can use the idle masks
      with the generic helpers. This replaces the hacky kfunc helpers added
      by a separate patch in V1.

    - CONFIG_SCHED_CLASS_EXT can no longer be enabled if SCHED_CORE is
      enabled. This restriction will be removed by a later patch which adds
      core-sched support.

    - Add MAINTAINERS entries and other misc changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Co-authored-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
---
 include/linux/sched.h     |   5 ++
 include/linux/sched/ext.h | 141 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 145 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 90691d99027e..06beb8a6e0ca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,8 @@ struct task_group;
 struct task_struct;
 struct user_event_mm;
 
+#include <linux/sched/ext.h>
+
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -802,6 +804,9 @@ struct task_struct {
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
 	struct sched_dl_entity		*dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+	struct sched_ext_entity		scx;
+#endif
 	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index a05dfcf533b0..c1530a7992cc 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -1,9 +1,148 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
 #ifndef _LINUX_SCHED_EXT_H
 #define _LINUX_SCHED_EXT_H
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-#error "NOT IMPLEMENTED YET"
+
+#include <linux/llist.h>
+#include <linux/rhashtable-types.h>
+
+enum scx_public_consts {
+	SCX_OPS_NAME_LEN	= 128,
+
+	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
+};
+
+/*
+ * DSQ (dispatch queue) IDs are 64bit of the format:
+ *
+ *   Bits: [63] [62 ..  0]
+ *         [ B] [   ID   ]
+ *
+ *    B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
+ *   ID: 63 bit ID
+ *
+ * Built-in IDs:
+ *
+ *   Bits: [63] [62] [61..32] [31 ..  0]
+ *         [ 1] [ L] [   R  ] [    V   ]
+ *
+ *    1: 1 for built-in DSQs.
+ *    L: 1 for LOCAL_ON DSQ IDs, 0 for others
+ *    V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
+ */
+enum scx_dsq_id_flags {
+	SCX_DSQ_FLAG_BUILTIN	= 1LLU << 63,
+	SCX_DSQ_FLAG_LOCAL_ON	= 1LLU << 62,
+
+	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
+	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
+	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
+	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
+};
+
+/*
+ * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the
+ * scheduler core and the BPF scheduler. See the documentation for more details.
+ */
+struct scx_dispatch_q {
+	raw_spinlock_t		lock;
+	struct list_head	list;	/* tasks in dispatch order */
+	u32			nr;
+	u64			id;
+	struct rhash_head	hash_node;
+	struct llist_node	free_node;
+	struct rcu_head		rcu;
+};
+
+/* scx_entity.flags */
+enum scx_ent_flags {
+	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
+	SCX_TASK_BAL_KEEP	= 1 << 1, /* balance decided to keep current */
+	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
+	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
+
+	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
+	SCX_TASK_STATE_BITS	= 2,
+	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
+
+	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
+};
+
+/* scx_entity.flags & SCX_TASK_STATE_MASK */
+enum scx_task_state {
+	SCX_TASK_NONE,		/* ops.init_task() not called yet */
+	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
+	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
+	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */
+
+	SCX_TASK_NR_STATES,
+};
+
+/*
+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
+ * everywhere and the following bits track which kfunc sets are currently
+ * allowed for %current. This simple per-task tracking works because SCX ops
+ * nest in a limited way. BPF will likely implement a way to allow and disallow
+ * kfuncs depending on the calling context which will replace this manual
+ * mechanism. See scx_kf_allow().
+ */
+enum scx_kf_mask {
+	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
+	/* all non-sleepables may be nested inside SLEEPABLE */
+	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	/* ops.dequeue (in REST) may be nested inside DISPATCH */
+	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
+	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
+	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
+	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */
+
+	__SCX_KF_RQ_LOCKED	= SCX_KF_DISPATCH |
+				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
+/*
+ * The following is embedded in task_struct and contains all fields necessary
+ * for a task to be scheduled by SCX.
+ */
+struct sched_ext_entity {
+	struct scx_dispatch_q	*dsq;
+	struct list_head	dsq_node;
+	u32			flags;		/* protected by rq lock */
+	u32			weight;
+	s32			sticky_cpu;
+	s32			holding_cpu;
+	u32			kf_mask;	/* see scx_kf_mask above */
+	atomic_long_t		ops_state;
+
+	struct list_head	runnable_node;	/* rq->scx.runnable_list */
+
+	u64			ddsp_dsq_id;
+	u64			ddsp_enq_flags;
+
+	/* BPF scheduler modifiable fields */
+
+	/*
+	 * Runtime budget in nsecs. This is usually set through
+	 * scx_bpf_dispatch() but can also be modified directly by the BPF
+	 * scheduler. Automatically decreased by SCX as the task executes. On
+	 * depletion, a scheduling event is triggered.
+	 */
+	u64			slice;
+
+	/* cold fields */
+	/* must be the last field, see init_scx_entity() */
+	struct list_head	tasks_node;
+};
+
+void sched_ext_free(struct task_struct *p);
+
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
-- 
cgit v1.2.3


From 8a010b81b3a50b033fc3cddc613517abda586cbe Mon Sep 17 00:00:00 2001
From: David Vernet <dvernet@meta.com>
Date: Tue, 18 Jun 2024 10:09:18 -1000
Subject: sched_ext: Implement runnable task stall watchdog

The most common and critical way that a BPF scheduler can misbehave is by
failing to run runnable tasks for too long. This patch implements a
watchdog.

* All tasks record when they become runnable.

* A watchdog work periodically scans all runnable tasks. If any task has
  stayed runnable for too long, the BPF scheduler is aborted.

* scheduler_tick() monitors whether the watchdog itself is stuck. If so, the
  BPF scheduler is aborted.

Because the watchdog only scans the tasks which are currently runnable and
usually very infrequently, the overhead should be negligible.
scx_qmap is updated so that it can be told to stall user and/or
kernel tasks.

A detected task stall looks like the following:

 sched_ext: BPF scheduler "qmap" errored, disabling
 sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s)
    scx_check_timeout_workfn+0x10e/0x1b0
    process_one_work+0x287/0x560
    worker_thread+0x234/0x420
    kthread+0xe9/0x100
    ret_from_fork+0x1f/0x30

A detected watchdog stall:

 sched_ext: BPF scheduler "qmap" errored, disabling
 sched_ext: runnable task stall (watchdog failed to check in for 5.001s)
    scheduler_tick+0x2eb/0x340
    update_process_times+0x7a/0x90
    tick_sched_timer+0xd8/0x130
    __hrtimer_run_queues+0x178/0x3b0
    hrtimer_interrupt+0xfc/0x390
    __sysvec_apic_timer_interrupt+0xb7/0x2b0
    sysvec_apic_timer_interrupt+0x90/0xb0
    asm_sysvec_apic_timer_interrupt+0x1b/0x20
    default_idle+0x14/0x20
    arch_cpu_idle+0xf/0x20
    default_idle_call+0x50/0x90
    do_idle+0xe8/0x240
    cpu_startup_entry+0x1d/0x20
    kernel_init+0x0/0x190
    start_kernel+0x0/0x392
    start_kernel+0x324/0x392
    x86_64_start_reservations+0x2a/0x2c
    x86_64_start_kernel+0x104/0x109
    secondary_startup_64_no_verify+0xce/0xdb

Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h to
inline scx_notify_sched_tick().

v4: - While disabling, cancel_delayed_work_sync(&scx_watchdog_work) was
      being called before forward progress was guaranteed and thus could
      lead to system lockup. Relocated.

    - While enabling, it was comparing msecs against jiffies without
      conversion leading to spurious load failures on lower HZ kernels.
      Fixed.

    - runnable list management is now used by core bypass logic and moved to
      the patch implementing sched_ext core.

v3: - bpf_scx_init_member() was incorrectly comparing ops->timeout_ms
      against SCX_WATCHDOG_MAX_TIMEOUT which is in jiffies without
      conversion leading to spurious load failures in lower HZ kernels.
      Fixed.

v2: - Julia Lawall noticed that the watchdog code was mixing msecs and
      jiffies. Fix by using jiffies for everything.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Cc: Julia Lawall <julia.lawall@inria.fr>
---
 include/linux/sched/ext.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index c1530a7992cc..96031252436f 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -122,6 +122,7 @@ struct sched_ext_entity {
 	atomic_long_t		ops_state;
 
 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
+	unsigned long		runnable_at;
 
 	u64			ddsp_dsq_id;
 	u64			ddsp_enq_flags;
-- 
cgit v1.2.3


From 7bb6f0810ecfb73a9d7a2ca56fb001e0201a6758 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:18 -1000
Subject: sched_ext: Allow BPF schedulers to disallow specific tasks from
 joining SCHED_EXT

BPF schedulers might not want to schedule certain tasks - e.g. kernel
threads. This patch adds p->scx.disallow which can be set by BPF schedulers
in such cases. The field can be changed anytime and setting it in
ops.prep_enable() guarantees that the task can never be scheduled by
sched_ext.

scx_qmap is updated with the -d option to disallow a specific PID:

  # echo $$
  1092
  # grep -E '(policy)|(ext\.enabled)' /proc/self/sched
  policy                                       :                    0
  ext.enabled                                  :                    0
  # ./set-scx 1092
  # grep -E '(policy)|(ext\.enabled)' /proc/self/sched
  policy                                       :                    7
  ext.enabled                                  :                    0

Run "scx_qmap -p -d 1092" in another terminal.

  # cat /sys/kernel/sched_ext/nr_rejected
  1
  # grep -E '(policy)|(ext\.enabled)' /proc/self/sched
  policy                                       :                    0
  ext.enabled                                  :                    0
  # ./set-scx 1092
  setparam failed for 1092 (Permission denied)

- v4: Refreshed on top of tip:sched/core.

- v3: Update description to reflect /sys/kernel/sched_ext interface change.

- v2: Use atomic_long_t instead of atomic64_t for scx_kick_cpus_pnt_seqs to
      accommodate 32bit archs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/ext.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 96031252436f..ea7c501ac819 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -137,6 +137,18 @@ struct sched_ext_entity {
 	 */
 	u64			slice;
 
+	/*
+	 * If set, reject future sched_setscheduler(2) calls updating the policy
+	 * to %SCHED_EXT with -%EACCES.
+	 *
+	 * If set from ops.init_task() and the task's policy is already
+	 * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
+	 * or by inhering the parent's policy during fork, the task's policy is
+	 * rejected and forcefully reverted to %SCHED_NORMAL. The number of
+	 * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected.
+	 */
+	bool			disallow;	/* reject switching into SCX */
+
 	/* cold fields */
 	/* must be the last field, see init_scx_entity() */
 	struct list_head	tasks_node;
-- 
cgit v1.2.3


From 1538e33995eaf3f315cbb5506019b9f913ed8555 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Tue, 18 Jun 2024 10:09:18 -1000
Subject: sched_ext: Print sched_ext info when dumping stack

It would be useful to see what the sched_ext scheduler state is, and what
scheduler is running, when we're dumping a task's stack. This patch
therefore adds a new print_scx_info() function that's called in the same
context as print_worker_info() and print_stop_info(). An example dump
follows.

  BUG: kernel NULL pointer dereference, address: 0000000000000999
  #PF: supervisor write access in kernel mode
  #PF: error_code(0x0002) - not-present page
  PGD 0 P4D 0
  Oops: 0002 [#1] PREEMPT SMP
  CPU: 13 PID: 2047 Comm: insmod Tainted: G           O       6.6.0-work-10323-gb58d4cae8e99-dirty #34
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
  Sched_ext: qmap (enabled+all), task: runnable_at=-17ms
  RIP: 0010:init_module+0x9/0x1000 [test_module]
  ...

v3: - scx_ops_enable_state_str[] definition moved to an earlier patch as
      it's now used by core implementation.

    - Convert jiffy delta to msecs using jiffies_to_msecs() instead of
      multiplying by (HZ / MSEC_PER_SEC). The conversion is implemented in
      jiffies_delta_msecs().

v2: - We are now using scx_ops_enable_state_str[] outside
      CONFIG_SCHED_DEBUG. Move it outside of CONFIG_SCHED_DEBUG and to the
      top. This was reported by Changwoo and Andrea.

Signed-off-by: David Vernet <void@manifault.com>
Reported-by: Changwoo Min <changwoo@igalia.com>
Reported-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index ea7c501ac819..85fb5dc725ef 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -155,10 +155,12 @@ struct sched_ext_entity {
 };
 
 void sched_ext_free(struct task_struct *p);
+void print_scx_info(const char *log_lvl, struct task_struct *p);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 #endif	/* _LINUX_SCHED_EXT_H */
-- 
cgit v1.2.3


From 81aae789181b5850d77dfdf74d4b85c63f0705e9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:19 -1000
Subject: sched_ext: Implement scx_bpf_kick_cpu() and task preemption support

It's often useful to wake up and/or trigger reschedule on other CPUs. This
patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to
kick the target CPU into the scheduling path.

As a sched_ext task relinquishes its CPU only after its slice is depleted,
this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the
slice of the target CPU's current task to guarantee that sched_ext's
scheduling path runs on the CPU.

If SCX_KICK_IDLE is specified, the target CPU is kicked iff the CPU is idle
to guarantee that the target CPU will go through at least one full sched_ext
scheduling cycle after the kicking. This can be used to wake up idle CPUs
without incurring unnecessary overhead if it isn't currently idle.

As a demonstration of how backward compatibility can be supported using BPF
CO-RE, tools/sched_ext/include/scx/compat.bpf.h is added. It provides
__COMPAT_scx_bpf_kick_cpu_IDLE() which uses SCX_KICK_IDLE if available or
becomes a regular kicking otherwise. This allows schedulers to use the new
SCX_KICK_IDLE while maintaining support for older kernels. The plan is to
temporarily use compat helpers to ease API updates and drop them after a few
kernel releases.

v5: - SCX_KICK_IDLE added. Note that this also adds a compat mechanism for
      schedulers so that they can support kernels without SCX_KICK_IDLE.
      This is useful as a demonstration of how new feature flags can be
      added in a backward compatible way.

    - kick_cpus_irq_workfn() reimplemented so that it touches the pending
      cpumasks only as necessary to reduce kicking overhead on machines with
      a lot of CPUs.

    - tools/sched_ext/include/scx/compat.bpf.h added.

v4: - Move example scheduler to its own patch.

v3: - Make scx_example_central switch all tasks by default.

    - Convert to BPF inline iterators.

v2: - Julia Lawall reported that scx_example_central can overflow the
      dispatch buffer and malfunction. As scheduling for other CPUs can't be
      handled by the automatic retry mechanism, fix by implementing an
      explicit overflow and retry handling.

    - Updated to use generic BPF cpumask helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/ext.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 85fb5dc725ef..3b2809b980ac 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -134,6 +134,10 @@ struct sched_ext_entity {
 	 * scx_bpf_dispatch() but can also be modified directly by the BPF
 	 * scheduler. Automatically decreased by SCX as the task executes. On
 	 * depletion, a scheduling event is triggered.
+	 *
+	 * This value is cleared to zero if the task is preempted by
+	 * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
+	 * task ran. Use p->se.sum_exec_runtime instead.
 	 */
 	u64			slice;
 
-- 
cgit v1.2.3


From 22a920209ab6aa4f8ec960ed81041643fddeaec6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:19 -1000
Subject: sched_ext: Implement tickless support

Allow BPF schedulers to indicate tickless operation by setting p->scx.slice
to SCX_SLICE_INF. A CPU whose current task has infinte slice goes into
tickless operation.

scx_central is updated to use tickless operations for all tasks and
instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT
and task state tracking added by the previous patches.

Currently, there is no way to pin the timer on the central CPU, so it may
end up on one of the worker CPUs; however, outside of that, the worker CPUs
can go tickless both while running sched_ext tasks and idling.

With schbench running, scx_central shows:

  root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     142024        656        664        449   Local timer interrupts
  LOC:     161663        663        665        449   Local timer interrupts

Without it:

  root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     188778       3142       3793       3993   Local timer interrupts
  LOC:     198993       5314       6323       6438   Local timer interrupts

While scx_central itself is too barebone to be useful as a
production scheduler, a more featureful central scheduler can be built using
the same approach. Google's experience shows that such an approach can have
significant benefits for certain applications such as VM hosting.

v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available.

v3: Pin the central scheduler's timer on the central_cpu using
    BPF_F_TIMER_CPU_PIN.

v2: Convert to BPF inline iterators.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/ext.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 3b2809b980ac..6f1a4977e9f8 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -16,6 +16,7 @@ enum scx_public_consts {
 	SCX_OPS_NAME_LEN	= 128,
 
 	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
+	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
 };
 
 /*
-- 
cgit v1.2.3


From 36454023f50b2aadd25b7f47c50b5edc0c59e1c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:19 -1000
Subject: sched_ext: Track tasks that are subjects of the in-flight SCX
 operation

When some SCX operations are in flight, it is known that the subject task's
rq lock is held throughout which makes it safe to access certain fields of
the task - e.g. its current task_group. We want to add SCX kfunc helpers
that can make use of this guarantee - e.g. to help determining the currently
associated CPU cgroup from the task's current task_group.

As it'd be dangerous call such a helper on a task which isn't rq lock
protected, the helper should be able to verify the input task and reject
accordingly. This patch adds sched_ext_entity.kf_tasks[] that track the
tasks which are currently being operated on by a terminal SCX operation. The
new SCX_CALL_OP_[2]TASK[_RET]() can be used when invoking SCX operations
which take tasks as arguments and the scx_kf_allowed_on_arg_tasks() can be
used by kfunc helpers to verify the input task status.

Note that as sched_ext_entity.kf_tasks[] can't handle nesting, the tracking
is currently only limited to terminal SCX operations. If needed in the
future, this restriction can be removed by moving the tracking to the task
side with a couple per-task counters.

v2: Updated to reflect the addition of SCX_KF_SELECT_CPU.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
---
 include/linux/sched/ext.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 6f1a4977e9f8..74341dbc6a19 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -106,6 +106,7 @@ enum scx_kf_mask {
 
 	__SCX_KF_RQ_LOCKED	= SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
 
 /*
@@ -120,6 +121,7 @@ struct sched_ext_entity {
 	s32			sticky_cpu;
 	s32			holding_cpu;
 	u32			kf_mask;	/* see scx_kf_mask above */
+	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
 	atomic_long_t		ops_state;
 
 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
-- 
cgit v1.2.3


From 245254f7081dbe1c8da54675d0e4ddbe74cee61b Mon Sep 17 00:00:00 2001
From: David Vernet <dvernet@meta.com>
Date: Tue, 18 Jun 2024 10:09:20 -1000
Subject: sched_ext: Implement sched_ext_ops.cpu_acquire/release()

Scheduler classes are strictly ordered and when a higher priority class has
tasks to run, the lower priority ones lose access to the CPU. Being able to
monitor and act on these events are necessary for use cases includling
strict core-scheduling and latency management.

This patch adds two operations ops.cpu_acquire() and .cpu_release(). The
former is invoked when a CPU becomes available to the BPF scheduler and the
opposite for the latter. This patch also implements
scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger
requeueing of all tasks in the local dsq of the CPU so that the tasks can be
reassigned to other available CPUs.

scx_pair is updated to use .cpu_acquire/release() along with
%SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU
is preempted by a higher priority scheduler class.

scx_qmap is updated to use .cpu_acquire/release() to empty the local
dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers
that want to have a tight control over latency.

v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing.

v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces
    access control through the verifier, so the qualifier isn't actually
    operative and only gets in the way when interacting with various
    helpers.

v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local()
    from ops.cpu_release() nested inside ops.init() and other sleepable
    operations.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 include/linux/sched/ext.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 74341dbc6a19..21c627337e01 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -98,13 +98,15 @@ enum scx_kf_mask {
 	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
 	/* all non-sleepables may be nested inside SLEEPABLE */
 	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
+	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
 	/* ops.dequeue (in REST) may be nested inside DISPATCH */
 	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
 	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
 	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
 	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */
 
-	__SCX_KF_RQ_LOCKED	= SCX_KF_DISPATCH |
+	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
-- 
cgit v1.2.3


From 7b0888b7cc1924b74ce660e02f6211df8dd46e7b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:20 -1000
Subject: sched_ext: Implement core-sched support

The core-sched support is composed of the following parts:

- task_struct->scx.core_sched_at is added. This is a timestamp which can be
  used to order tasks. Depending on whether the BPF scheduler implements
  custom ordering, it tracks either global FIFO ordering of all tasks or
  local-DSQ ordering within the dispatched tasks on a CPU.

- prio_less() is updated to call scx_prio_less() when comparing SCX tasks.
  scx_prio_less() calls ops.core_sched_before() if available or uses the
  core_sched_at timestamp. For global FIFO ordering, the BPF scheduler
  doesn't need to do anything. Otherwise, it should implement
  ops.core_sched_before() which reflects the ordering.

- When core-sched is enabled, balance_scx() balances all SMT siblings so
  that they all have tasks dispatched if necessary before pick_task_scx() is
  called. pick_task_scx() picks between the current task and the first
  dispatched task on the local DSQ based on availability and the
  core_sched_at timestamps. Note that FIFO ordering is expected among the
  already dispatched tasks whether running or on the local DSQ, so this path
  always compares core_sched_at instead of calling into
  ops.core_sched_before().

qmap_core_sched_before() is added to scx_qmap. It scales the
distances from the heads of the queues to compare the tasks across different
priority queues and seems to behave as expected.

v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi.

v2: Sched core added the const qualifiers to prio_less task arguments.
    Explicitly drop them for ops.core_sched_before() task arguments. BPF
    enforces access control through the verifier, so the qualifier isn't
    actually operative and only gets in the way when interacting with
    various helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Reviewed-by: Josh Don <joshdon@google.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
---
 include/linux/sched/ext.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 21c627337e01..3db7b32b2d1d 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -129,6 +129,9 @@ struct sched_ext_entity {
 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
 	unsigned long		runnable_at;
 
+#ifdef CONFIG_SCHED_CORE
+	u64			core_sched_at;	/* see scx_prio_less() */
+#endif
 	u64			ddsp_dsq_id;
 	u64			ddsp_enq_flags;
 
-- 
cgit v1.2.3


From 06e51be3d5e7a07aea5c9012773df8d5de01db6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:21 -1000
Subject: sched_ext: Add vtime-ordered priority queue to dispatch_q's

Currently, a dsq is always a FIFO. A task which is dispatched earlier gets
consumed or executed earlier. While this is sufficient when dsq's are used
for simple staging areas for tasks which are ready to execute, it'd make
dsq's a lot more useful if they can implement custom ordering.

This patch adds a vtime-ordered priority queue to dsq's. When the BPF
scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it
can specify the vtime tha the task should be inserted at and the task is
inserted into the priority queue in the dsq which is ordered according to
time_before64() comparison of the vtime values.

A DSQ can either be a FIFO or priority queue and automatically switches
between the two depending on whether scx_bpf_dispatch() or
scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ
already has the other type queued is not allowed and triggers an ops error.
Built-in DSQs must always be FIFOs.

This makes it very easy for the BPF schedulers to implement proper vtime
based scheduling within each dsq very easy and efficient at a negligible
cost in terms of code complexity and overhead.

scx_simple and scx_example_flatcg are updated to default to weighted
vtime scheduling (the latter within each cgroup). FIFO scheduling can be
selected with -f option.

v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes
      led to unexpected starvations, DSQs now error out if both modes are
      used at the same time and the built-in DSQs are no longer allowed to
      be priority queues.

    - Explicit type struct scx_dsq_node added to contain fields needed to be
      linked on DSQs. This will be used to implement stateful iterator.

    - Tasks are now always linked on dsq->list whether the DSQ is in FIFO or
      PRIQ mode. This confines PRIQ related complexities to the enqueue and
      dequeue paths. Other paths only need to look at dsq->list. This will
      also ease implementing BPF iterator.

    - Print p->scx.dsq_flags in debug dump.

v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own
      p->scx.dsq_flags. The flag is protected with the dsq lock unlike other
      flags in p->scx.flags. This led to flag corruption in some cases.

    - Add comments explaining the interaction between using consumption of
      p->scx.slice to determine vtime progress and yielding.

v2: - p->scx.dsq_vtime was not initialized on load or across cgroup
      migrations leading to some tasks being stalled for extended period of
      time depending on how saturated the machine is. Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
---
 include/linux/sched/ext.h | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 3db7b32b2d1d..9cee193dab19 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -49,12 +49,15 @@ enum scx_dsq_id_flags {
 };
 
 /*
- * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the
- * scheduler core and the BPF scheduler. See the documentation for more details.
+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
+ * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
+ * buffer between the scheduler core and the BPF scheduler. See the
+ * documentation for more details.
  */
 struct scx_dispatch_q {
 	raw_spinlock_t		lock;
 	struct list_head	list;	/* tasks in dispatch order */
+	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
 	u64			id;
 	struct rhash_head	hash_node;
@@ -86,6 +89,11 @@ enum scx_task_state {
 	SCX_TASK_NR_STATES,
 };
 
+/* scx_entity.dsq_flags */
+enum scx_ent_dsq_flags {
+	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
+};
+
 /*
  * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
  * everywhere and the following bits track which kfunc sets are currently
@@ -111,13 +119,19 @@ enum scx_kf_mask {
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
 
+struct scx_dsq_node {
+	struct list_head	list;		/* dispatch order */
+	struct rb_node		priq;		/* p->scx.dsq_vtime order */
+	u32			flags;		/* SCX_TASK_DSQ_* flags */
+};
+
 /*
  * The following is embedded in task_struct and contains all fields necessary
  * for a task to be scheduled by SCX.
  */
 struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
-	struct list_head	dsq_node;
+	struct scx_dsq_node	dsq_node;	/* protected by dsq lock */
 	u32			flags;		/* protected by rq lock */
 	u32			weight;
 	s32			sticky_cpu;
@@ -149,6 +163,15 @@ struct sched_ext_entity {
 	 */
 	u64			slice;
 
+	/*
+	 * Used to order tasks when dispatching to the vtime-ordered priority
+	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
+	 * but can also be modified directly by the BPF scheduler. Modifying it
+	 * while a task is queued on a dsq may mangle the ordering and is not
+	 * recommended.
+	 */
+	u64			dsq_vtime;
+
 	/*
 	 * If set, reject future sched_setscheduler(2) calls updating the policy
 	 * to %SCHED_EXT with -%EACCES.
-- 
cgit v1.2.3


From fa48e8d2c7b58d242c1db3a09c14f4274e055087 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Jun 2024 10:09:21 -1000
Subject: sched_ext: Documentation: scheduler: Document extensible scheduler
 class

Add Documentation/scheduler/sched-ext.rst which gives a high-level overview
and pointers to the examples.

v6: - Add paragraph explaining debug dump.

v5: - Updated to reflect /sys/kernel interface change. Kconfig options
      added.

v4: - README improved, reformatted in markdown and renamed to README.md.

v3: - Added tools/sched_ext/README.

    - Dropped _example prefix from scheduler names.

v2: - Apply minor edits suggested by Bagas. Caveats section dropped as all
      of them are addressed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Cc: Bagas Sanjaya <bagasdotme@gmail.com>
---
 include/linux/sched/ext.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 9cee193dab19..fe9a67ffe6b1 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
-- 
cgit v1.2.3


From d4af01c3731ff9c6e224d7183f8226a56d72b56c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 8 Jul 2024 14:30:55 -1000
Subject: sched_ext: Take out ->priq and ->flags from scx_dsq_node

struct scx_dsq_node contains two data structure nodes to link the containing
task to a DSQ and a flags field that is protected by the lock of the
associated DSQ. One reason why they are grouped into a struct is to use the
type independently as a cursor node when iterating tasks on a DSQ. However,
when iterating, the cursor only needs to be linked on the FIFO list and the
rb_node part ends up inflating the size of the iterator data structure
unnecessarily making it potentially too expensive to place it on stack.

Take ->priq and ->flags out of scx_dsq_node and put them in sched_ext_entity
as ->dsq_priq and ->dsq_flags, respectively. scx_dsq_node is renamed to
scx_dsq_list_node and the field names are renamed accordingly. This will
help implementing DSQ task iterator that can be allocated on stack.

No functional change intended.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: David Vernet <void@manifault.com>
---
 include/linux/sched/ext.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index fe9a67ffe6b1..eb9cfd18a923 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -121,10 +121,8 @@ enum scx_kf_mask {
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
 
-struct scx_dsq_node {
-	struct list_head	list;		/* dispatch order */
-	struct rb_node		priq;		/* p->scx.dsq_vtime order */
-	u32			flags;		/* SCX_TASK_DSQ_* flags */
+struct scx_dsq_list_node {
+	struct list_head	node;
 };
 
 /*
@@ -133,7 +131,9 @@ struct scx_dsq_node {
  */
 struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
-	struct scx_dsq_node	dsq_node;	/* protected by dsq lock */
+	struct scx_dsq_list_node dsq_list;	/* dispatch order */
+	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
+	u32			dsq_flags;	/* protected by DSQ lock */
 	u32			flags;		/* protected by rq lock */
 	u32			weight;
 	s32			sticky_cpu;
-- 
cgit v1.2.3


From 650ba21b131ed1f8ee57826b2c6295a3be221132 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 8 Jul 2024 14:30:55 -1000
Subject: sched_ext: Implement DSQ iterator

DSQs are very opaque in the consumption path. The BPF scheduler has no way
of knowing which tasks are being considered and which is picked. This patch
adds BPF DSQ iterator.

- Allows iterating tasks queued on a DSQ in the dispatch order or reverse
  from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs
  directly.

- Has ordering guarantee where only tasks which were already queued when the
  iteration started are visible and consumable during the iteration.

v5: - Add a comment to the naked list_empty(&dsq->list) test in
      consume_dispatch_q() to explain the reasoning behind the lockless test
      and by extension why nldsq_next_task() isn't used there.

    - scx_qmap changes separated into its own patch.

v4: - bpf_iter_scx_dsq_new() declaration in common.bpf.h was using the wrong
      type for the last argument (bool rev instead of u64 flags). Fix it.

v3: - Alexei pointed out that the iterator is too big to allocate on stack.
      Added a prep patch to reduce the size of the cursor. Now
      bpf_iter_scx_dsq is 48 bytes and bpf_iter_scx_dsq_kern is 40 bytes on
      64bit.

    - u32_before() comparison factored out.

v2: - scx_bpf_consume_task() is separated out into a separate patch.

    - DSQ seq and iter flags don't need to be u64. Use u32.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
---
 include/linux/sched/ext.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index eb9cfd18a923..593d2f4909dd 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -61,6 +61,7 @@ struct scx_dispatch_q {
 	struct list_head	list;	/* tasks in dispatch order */
 	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
 	u32			nr;
+	u32			seq;	/* used by BPF iter */
 	u64			id;
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
@@ -123,6 +124,7 @@ enum scx_kf_mask {
 
 struct scx_dsq_list_node {
 	struct list_head	node;
+	bool			is_bpf_iter_cursor;
 };
 
 /*
@@ -133,6 +135,7 @@ struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
 	struct scx_dsq_list_node dsq_list;	/* dispatch order */
 	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
+	u32			dsq_seq;
 	u32			dsq_flags;	/* protected by DSQ lock */
 	u32			flags;		/* protected by rq lock */
 	u32			weight;
-- 
cgit v1.2.3


From e8858fc07eb8386b9b6436d3e86eeb4b81e4f660 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sat, 13 Jul 2024 23:00:27 -0700
Subject: Input: msc5000_ts - remove the driver

MCS-5000 belongs to the 1st generation of Melfas chips, manufactured in
2000-2007.

The driver relies on custom platform data (no DT support) and there
never were any users of this driver in the mainline kernel. The commit
adding the driver mentioned that the driver was tested on S3C6410 NCP
board (with Samsung S3C6410 SoC) but the touchscreen device was never
added to the board file. This board was removed in v6.3 in commit
743c8fbb90ca ("ARM: s3c: remove most s3c64xx board support").

Remove the driver since there are no users.

Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20240714060029.1528662-1-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/mcs.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mcs.h b/include/linux/platform_data/mcs.h
index fcc6f2a1f5c3..f3b0749f1630 100644
--- a/include/linux/platform_data/mcs.h
+++ b/include/linux/platform_data/mcs.h
@@ -16,10 +16,6 @@ struct mcs_platform_data {
 	void (*poweron)(bool);
 	void (*cfg_pin)(void);
 
-	/* touchscreen */
-	unsigned int x_size;
-	unsigned int y_size;
-
 	/* touchkey */
 	const u32 *keymap;
 	unsigned int keymap_size;
-- 
cgit v1.2.3


From dd29eadee1e8f07bbec57dddf4befbcd3e3c0af7 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sat, 13 Jul 2024 23:00:28 -0700
Subject: Input: msc_touchkey - remove the driver

MCS-5000/5080 chips belong to the 1st generation of Melfas chips,
manufactured in 2000-2007.

The driver relies on custom platform data (no DT support) and there
never were any users of this driver in the mainline kernel. It is likely
that the driver was (like mcs5000_ts driver) was tested on S3C6410 NCP
board (with Samsung S3C6410 SoC), but the touchkey device was never
added to the board file. This board was removed in v6.3 in commit
743c8fbb90ca ("ARM: s3c: remove most s3c64xx board support").

Remove the driver since there are no users.

Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20240714060029.1528662-2-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/mcs.h | 26 --------------------------
 1 file changed, 26 deletions(-)
 delete mode 100644 include/linux/platform_data/mcs.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mcs.h b/include/linux/platform_data/mcs.h
deleted file mode 100644
index f3b0749f1630..000000000000
--- a/include/linux/platform_data/mcs.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2009 - 2010 Samsung Electronics Co.Ltd
- * Author: Joonyoung Shim <jy0922.shim@samsung.com>
- * Author: HeungJun Kim <riverful.kim@samsung.com>
- */
-
-#ifndef __LINUX_MCS_H
-#define __LINUX_MCS_H
-
-#define MCS_KEY_MAP(v, c)	((((v) & 0xff) << 16) | ((c) & 0xffff))
-#define MCS_KEY_VAL(v)		(((v) >> 16) & 0xff)
-#define MCS_KEY_CODE(v)		((v) & 0xffff)
-
-struct mcs_platform_data {
-	void (*poweron)(bool);
-	void (*cfg_pin)(void);
-
-	/* touchkey */
-	const u32 *keymap;
-	unsigned int keymap_size;
-	unsigned int key_maxval;
-	bool no_autorepeat;
-};
-
-#endif	/* __LINUX_MCS_H */
-- 
cgit v1.2.3


From e1a261ba599eec97e1c5c7760d5c3698fc24e6a6 Mon Sep 17 00:00:00 2001
From: Jocelyn Falempe <jfalempe@redhat.com>
Date: Tue, 2 Jul 2024 14:26:04 +0200
Subject: printk: Add a short description string to kmsg_dump()

kmsg_dump doesn't forward the panic reason string to the kmsg_dumper
callback.
This patch adds a new struct kmsg_dump_detail, that will hold the
reason and description, and pass it to the dump() callback.

To avoid updating all kmsg_dump() call, it adds a kmsg_dump_desc()
function and a macro for backward compatibility.

I've written this for drm_panic, but it can be useful for other
kmsg_dumper.
It allows to see the panic reason, like "sysrq triggered crash"
or "VFS: Unable to mount root fs on xxxx" on the drm panic screen.

v2:
 * Use a struct kmsg_dump_detail to hold the reason and description
   pointer, for more flexibility if we want to add other parameters.
   (Kees Cook)
 * Fix powerpc/nvram_64 build, as I didn't update the forward
   declaration of oops_to_nvram()

Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Kees Cook <kees@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240702122639.248110-1-jfalempe@redhat.com
---
 include/linux/kmsg_dump.h | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 906521c2329c..6055fc969877 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -39,6 +39,17 @@ struct kmsg_dump_iter {
 	u64	next_seq;
 };
 
+/**
+ * struct kmsg_dump_detail - kernel crash detail
+ * @reason: reason for the crash, see kmsg_dump_reason.
+ * @description: optional short string, to provide additional information.
+ */
+
+struct kmsg_dump_detail {
+	enum kmsg_dump_reason reason;
+	const char *description;
+};
+
 /**
  * struct kmsg_dumper - kernel crash message dumper structure
  * @list:	Entry in the dumper list (private)
@@ -49,13 +60,13 @@ struct kmsg_dump_iter {
  */
 struct kmsg_dumper {
 	struct list_head list;
-	void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason);
+	void (*dump)(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail);
 	enum kmsg_dump_reason max_reason;
 	bool registered;
 };
 
 #ifdef CONFIG_PRINTK
-void kmsg_dump(enum kmsg_dump_reason reason);
+void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc);
 
 bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 			char *line, size_t size, size_t *len);
@@ -71,7 +82,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper);
 
 const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason);
 #else
-static inline void kmsg_dump(enum kmsg_dump_reason reason)
+static inline void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
 {
 }
 
@@ -107,4 +118,9 @@ static inline const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
 }
 #endif
 
+static inline void kmsg_dump(enum kmsg_dump_reason reason)
+{
+	kmsg_dump_desc(reason, NULL);
+}
+
 #endif /* _LINUX_KMSG_DUMP_H */
-- 
cgit v1.2.3


From d20a9f568f99591886ec73b80ff7283486710643 Mon Sep 17 00:00:00 2001
From: Jocelyn Falempe <jfalempe@redhat.com>
Date: Wed, 17 Jul 2024 10:48:40 +0200
Subject: fbcon: Add an option to disable fbcon in panic

This is required to avoid conflict between DRM_PANIC, and fbcon. If
a drm device already handle panic with drm_panic, it should set
the skip_panic field in fb_info, so that fbcon will stay quiet, and
not overwrite the panic_screen.

Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20240717090102.968152-3-jfalempe@redhat.com
---
 include/linux/fb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index db7d97b10964..865dad03e73e 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -510,6 +510,7 @@ struct fb_info {
 	void *par;
 
 	bool skip_vt_switch; /* no VT switch on suspend/resume required */
+	bool skip_panic; /* Do not write to the fb after a panic */
 };
 
 /* This will go away
-- 
cgit v1.2.3


From a838e5dca63d1dc701e63b2b1176943c57485c45 Mon Sep 17 00:00:00 2001
From: Kemeng Shi <shikemeng@huaweicloud.com>
Date: Mon, 15 Jul 2024 21:05:32 +0800
Subject: quota: remove unneeded return value of register_quota_format

The register_quota_format always returns 0, simply remove unneeded return
value.

Link: https://patch.msgid.link/20240715130534.2112678-3-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/quota.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 07071e64abf3..89a0d83ddad0 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -526,7 +526,7 @@ struct quota_info {
 	const struct quota_format_ops *ops[MAXQUOTAS];	/* Operations for each type */
 };
 
-int register_quota_format(struct quota_format_type *fmt);
+void register_quota_format(struct quota_format_type *fmt);
 void unregister_quota_format(struct quota_format_type *fmt);
 
 struct quota_module_name {
-- 
cgit v1.2.3


From d5e79eeba3086a52593b295ac4bf6eddd64d4aad Mon Sep 17 00:00:00 2001
From: "T.J. Mercier" <tjmercier@google.com>
Date: Sat, 20 Jul 2024 15:15:42 +0800
Subject: dma-buf: heaps: Deduplicate docs and adopt common format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The docs for dma_heap_get_name were incorrect, and since they were
duplicated in the header they were wrong there too.

The docs formatting was inconsistent so I tried to make it more
consistent across functions since I'm already in here doing cleanup.

Remove multiple unused includes and alphabetize.

Signed-off-by: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Yong Wu <yong.wu@mediatek.com>
[Yong: Just add a comment for "priv" to mute build warning]
Signed-off-by: Yunfei Dong <yunfei.dong@mediatek.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240720071606.27930-5-yunfei.dong@mediatek.com
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 include/linux/dma-heap.h | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 064bad725061..27d15f60950a 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -9,14 +9,13 @@
 #ifndef _DMA_HEAPS_H
 #define _DMA_HEAPS_H
 
-#include <linux/cdev.h>
 #include <linux/types.h>
 
 struct dma_heap;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
- * @allocate:		allocate dmabuf and return struct dma_buf ptr
+ * @allocate:	allocate dmabuf and return struct dma_buf ptr
  *
  * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
  */
@@ -41,28 +40,10 @@ struct dma_heap_export_info {
 	void *priv;
 };
 
-/**
- * dma_heap_get_drvdata() - get per-heap driver data
- * @heap: DMA-Heap to retrieve private data for
- *
- * Returns:
- * The per-heap data for the heap.
- */
 void *dma_heap_get_drvdata(struct dma_heap *heap);
 
-/**
- * dma_heap_get_name() - get heap name
- * @heap: DMA-Heap to retrieve private data for
- *
- * Returns:
- * The char* for the heap name.
- */
 const char *dma_heap_get_name(struct dma_heap *heap);
 
-/**
- * dma_heap_add - adds a heap to dmabuf heaps
- * @exp_info:		information needed to register this heap
- */
 struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info);
 
 #endif /* _DMA_HEAPS_H */
-- 
cgit v1.2.3


From 23a55f4492fcf868d068da31a2cd30c15f46207d Mon Sep 17 00:00:00 2001
From: Ofir Gal <ofir.gal@volumez.com>
Date: Thu, 18 Jul 2024 11:45:12 +0300
Subject: net: introduce helper sendpages_ok()

Network drivers are using sendpage_ok() to check the first page of an
iterator in order to disable MSG_SPLICE_PAGES. The iterator can
represent list of contiguous pages.

When MSG_SPLICE_PAGES is enabled skb_splice_from_iter() is being used,
it requires all pages in the iterator to be sendable. Therefore it needs
to check that each page is sendable.

The patch introduces a helper sendpages_ok(), it returns true if all the
contiguous pages are sendable.

Drivers who want to send contiguous pages with MSG_SPLICE_PAGES may use
this helper to check whether the page list is OK. If the helper does not
return true, the driver should remove MSG_SPLICE_PAGES flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Ofir Gal <ofir.gal@volumez.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20240718084515.3833733-2-ofir.gal@volumez.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/net.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 688320b79fcc..b75bc534c1b3 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page)
 	return !PageSlab(page) && page_count(page) >= 1;
 }
 
+/*
+ * Check sendpage_ok on contiguous pages.
+ */
+static inline bool sendpages_ok(struct page *page, size_t len, size_t offset)
+{
+	struct page *p = page + (offset >> PAGE_SHIFT);
+	size_t count = 0;
+
+	while (count < len) {
+		if (!sendpage_ok(p))
+			return false;
+
+		p++;
+		count += PAGE_SIZE;
+	}
+
+	return true;
+}
+
 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
 		   size_t num, size_t len);
 int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
-- 
cgit v1.2.3


From 320f6693097bf89d67f9cabad24a2b911e23073f Mon Sep 17 00:00:00 2001
From: Marcelo Schmitt <marcelo.schmitt@analog.com>
Date: Fri, 12 Jul 2024 16:21:03 -0300
Subject: spi: bitbang: Implement support for MOSI idle state configuration

Some SPI peripherals may require strict MOSI line state when the controller
is not clocking out data. Implement support for MOSI idle state
configuration (low or high) by setting the data output line level on
controller setup and after transfers. Bitbang operations now call
controller specific set_mosi_idle() callback to set MOSI to its idle state.
The MOSI line is kept at its idle state if no tx buffer is provided.

Acked-by: Nuno Sa <nuno.sa@analog.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Marcelo Schmitt <marcelo.schmitt@analog.com>
Link: https://patch.msgid.link/de61a600b56ed9cb714d5ea87afa88948e70041e.1720810545.git.marcelo.schmitt@analog.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi_bitbang.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index d4cb83195f7a..c92cd43a47f4 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -24,6 +24,7 @@ struct spi_bitbang {
 #define	BITBANG_CS_ACTIVE	1	/* normally nCS, active low */
 #define	BITBANG_CS_INACTIVE	0
 
+	void	(*set_mosi_idle)(struct spi_device *spi);
 	/* txrx_bufs() may handle dma mapping for transfers that don't
 	 * already have one (transfer.{tx,rx}_dma is zero), or use PIO
 	 */
-- 
cgit v1.2.3


From d65d411c9259a2499081e1e7ed91088232666b57 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 25 Jul 2023 12:08:50 +0100
Subject: treewide: context_tracking: Rename CONTEXT_* into CT_STATE_*

Context tracking state related symbols currently use a mix of the
CONTEXT_ (e.g. CONTEXT_KERNEL) and CT_SATE_ (e.g. CT_STATE_MASK) prefixes.

Clean up the naming and make the ctx_state enum use the CT_STATE_ prefix.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking.h       | 16 ++++++++--------
 include/linux/context_tracking_state.h | 20 ++++++++++----------
 include/linux/entry-common.h           |  2 +-
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 6e76b9dba00e..28fcfa184903 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -26,26 +26,26 @@ extern void user_exit_callable(void);
 static inline void user_enter(void)
 {
 	if (context_tracking_enabled())
-		ct_user_enter(CONTEXT_USER);
+		ct_user_enter(CT_STATE_USER);
 
 }
 static inline void user_exit(void)
 {
 	if (context_tracking_enabled())
-		ct_user_exit(CONTEXT_USER);
+		ct_user_exit(CT_STATE_USER);
 }
 
 /* Called with interrupts disabled.  */
 static __always_inline void user_enter_irqoff(void)
 {
 	if (context_tracking_enabled())
-		__ct_user_enter(CONTEXT_USER);
+		__ct_user_enter(CT_STATE_USER);
 
 }
 static __always_inline void user_exit_irqoff(void)
 {
 	if (context_tracking_enabled())
-		__ct_user_exit(CONTEXT_USER);
+		__ct_user_exit(CT_STATE_USER);
 }
 
 static inline enum ctx_state exception_enter(void)
@@ -57,7 +57,7 @@ static inline enum ctx_state exception_enter(void)
 		return 0;
 
 	prev_ctx = __ct_state();
-	if (prev_ctx != CONTEXT_KERNEL)
+	if (prev_ctx != CT_STATE_KERNEL)
 		ct_user_exit(prev_ctx);
 
 	return prev_ctx;
@@ -67,7 +67,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 {
 	if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) &&
 	    context_tracking_enabled()) {
-		if (prev_ctx != CONTEXT_KERNEL)
+		if (prev_ctx != CT_STATE_KERNEL)
 			ct_user_enter(prev_ctx);
 	}
 }
@@ -75,7 +75,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 static __always_inline bool context_tracking_guest_enter(void)
 {
 	if (context_tracking_enabled())
-		__ct_user_enter(CONTEXT_GUEST);
+		__ct_user_enter(CT_STATE_GUEST);
 
 	return context_tracking_enabled_this_cpu();
 }
@@ -83,7 +83,7 @@ static __always_inline bool context_tracking_guest_enter(void)
 static __always_inline void context_tracking_guest_exit(void)
 {
 	if (context_tracking_enabled())
-		__ct_user_exit(CONTEXT_GUEST);
+		__ct_user_exit(CT_STATE_GUEST);
 }
 
 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index bbff5f7f8803..f1c53125edee 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -10,18 +10,18 @@
 #define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
 
 enum ctx_state {
-	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
-	CONTEXT_KERNEL		= 0,
-	CONTEXT_IDLE		= 1,
-	CONTEXT_USER		= 2,
-	CONTEXT_GUEST		= 3,
-	CONTEXT_MAX		= 4,
+	CT_STATE_DISABLED	= -1,	/* returned by ct_state() if unknown */
+	CT_STATE_KERNEL		= 0,
+	CT_STATE_IDLE		= 1,
+	CT_STATE_USER		= 2,
+	CT_STATE_GUEST		= 3,
+	CT_STATE_MAX		= 4,
 };
 
 /* Even value for idle, else odd. */
-#define RCU_DYNTICKS_IDX CONTEXT_MAX
+#define RCU_DYNTICKS_IDX CT_STATE_MAX
 
-#define CT_STATE_MASK (CONTEXT_MAX - 1)
+#define CT_STATE_MASK (CT_STATE_MAX - 1)
 #define CT_DYNTICKS_MASK (~CT_STATE_MASK)
 
 struct context_tracking {
@@ -123,14 +123,14 @@ static inline bool context_tracking_enabled_this_cpu(void)
  *
  * Returns the current cpu's context tracking state if context tracking
  * is enabled.  If context tracking is disabled, returns
- * CONTEXT_DISABLED.  This should be used primarily for debugging.
+ * CT_STATE_DISABLED.  This should be used primarily for debugging.
  */
 static __always_inline int ct_state(void)
 {
 	int ret;
 
 	if (!context_tracking_enabled())
-		return CONTEXT_DISABLED;
+		return CT_STATE_DISABLED;
 
 	preempt_disable();
 	ret = __ct_state();
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index b0fb775a600d..1e50cdb83ae5 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -108,7 +108,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 	arch_enter_from_user_mode(regs);
 	lockdep_hardirqs_off(CALLER_ADDR0);
 
-	CT_WARN_ON(__ct_state() != CONTEXT_USER);
+	CT_WARN_ON(__ct_state() != CT_STATE_USER);
 	user_exit_irqoff();
 
 	instrumentation_begin();
-- 
cgit v1.2.3


From 4aa35e0db6d758e8f952ed30d7ac3117c376562c Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 25 Jul 2023 12:19:01 +0100
Subject: context_tracking, rcu: Rename RCU_DYNTICKS_IDX into CT_RCU_WATCHING

The symbols relating to the CT_STATE part of context_tracking.state are now
all prefixed with CT_STATE.

The RCU dynticks counter part of that atomic variable still involves
symbols with different prefixes, align them all to be prefixed with
CT_RCU_WATCHING.

Suggested-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking.h       |  6 +++---
 include/linux/context_tracking_state.h | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 28fcfa184903..a6c36780cc3b 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -119,7 +119,7 @@ extern void ct_idle_exit(void);
  */
 static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
-	return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX);
+	return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING);
 }
 
 /*
@@ -142,7 +142,7 @@ static __always_inline bool warn_rcu_enter(void)
 	preempt_disable_notrace();
 	if (rcu_dynticks_curr_cpu_in_eqs()) {
 		ret = true;
-		ct_state_inc(RCU_DYNTICKS_IDX);
+		ct_state_inc(CT_RCU_WATCHING);
 	}
 
 	return ret;
@@ -151,7 +151,7 @@ static __always_inline bool warn_rcu_enter(void)
 static __always_inline void warn_rcu_exit(bool rcu)
 {
 	if (rcu)
-		ct_state_inc(RCU_DYNTICKS_IDX);
+		ct_state_inc(CT_RCU_WATCHING);
 	preempt_enable_notrace();
 }
 
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index f1c53125edee..94d6a935af3b 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -18,11 +18,11 @@ enum ctx_state {
 	CT_STATE_MAX		= 4,
 };
 
-/* Even value for idle, else odd. */
-#define RCU_DYNTICKS_IDX CT_STATE_MAX
+/* Odd value for watching, else even. */
+#define CT_RCU_WATCHING CT_STATE_MAX
 
 #define CT_STATE_MASK (CT_STATE_MAX - 1)
-#define CT_DYNTICKS_MASK (~CT_STATE_MASK)
+#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK)
 
 struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_USER
@@ -58,21 +58,21 @@ static __always_inline int __ct_state(void)
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 static __always_inline int ct_dynticks(void)
 {
-	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_DYNTICKS_MASK;
+	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK;
 }
 
 static __always_inline int ct_dynticks_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return atomic_read(&ct->state) & CT_DYNTICKS_MASK;
+	return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK;
 }
 
 static __always_inline int ct_dynticks_cpu_acquire(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return atomic_read_acquire(&ct->state) & CT_DYNTICKS_MASK;
+	return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK;
 }
 
 static __always_inline long ct_dynticks_nesting(void)
-- 
cgit v1.2.3


From a4a7921ec08d016f9d692752e2f82f66369c7ffa Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 10:45:56 +0200
Subject: context_tracking, rcu: Rename ct_dynticks() into ct_rcu_watching()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 94d6a935af3b..cb90d8c17810 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -56,7 +56,7 @@ static __always_inline int __ct_state(void)
 #endif
 
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
-static __always_inline int ct_dynticks(void)
+static __always_inline int ct_rcu_watching(void)
 {
 	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK;
 }
-- 
cgit v1.2.3


From a9fde9d1a5dd42edadf61cf4e64aa234b4c5bd3f Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 10:47:14 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_cpu() into
 ct_rcu_watching_cpu()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index cb90d8c17810..ad5a06a42b4a 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -61,7 +61,7 @@ static __always_inline int ct_rcu_watching(void)
 	return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK;
 }
 
-static __always_inline int ct_dynticks_cpu(int cpu)
+static __always_inline int ct_rcu_watching_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-- 
cgit v1.2.3


From 125716c393889f9035567d4d78da239483f63ece Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 14:47:12 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_cpu_acquire() into
 ct_rcu_watching_cpu_acquire()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index ad5a06a42b4a..ad6570ffeff3 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -68,7 +68,7 @@ static __always_inline int ct_rcu_watching_cpu(int cpu)
 	return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK;
 }
 
-static __always_inline int ct_dynticks_cpu_acquire(int cpu)
+static __always_inline int ct_rcu_watching_cpu_acquire(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-- 
cgit v1.2.3


From 7aeba709a048d870c15940af8b620b16281c3b9e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:42 +0200
Subject: rcu/nocb: Introduce RCU_NOCB_LOCKDEP_WARN()

Checking for races against concurrent (de-)offloading implies the
creation of !CONFIG_RCU_NOCB_CPU stubs to check if each relevant lock
is held. For now this only implies the nocb_lock but more are to be
expected.

Create instead a NOCB specific version of RCU_LOCKDEP_WARN() to avoid
the proliferation of stubs.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..d48d3c237305 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -144,11 +144,18 @@ void rcu_init_nohz(void);
 int rcu_nocb_cpu_offload(int cpu);
 int rcu_nocb_cpu_deoffload(int cpu);
 void rcu_nocb_flush_deferred_wakeup(void);
+
+#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s)
+
 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
 static inline void rcu_init_nohz(void) { }
 static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
 static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
 static inline void rcu_nocb_flush_deferred_wakeup(void) { }
+
+#define RCU_NOCB_LOCKDEP_WARN(c, s)
+
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 /*
-- 
cgit v1.2.3


From bae6076ebbd14cc1f1bd4de65c2db21d3ab109d7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:50 +0200
Subject: rcu/nocb: Remove SEGCBLIST_RCU_CORE

RCU core can't be running anymore while in the middle of (de-)offloading
since this sort of transition now only applies to offline CPUs.

The SEGCBLIST_RCU_CORE state can therefore be removed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index ba95c06675e1..5469c54cd778 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -185,11 +185,10 @@ struct rcu_cblist {
  *  ----------------------------------------------------------------------------
  */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_RCU_CORE	BIT(1)
-#define SEGCBLIST_LOCKING	BIT(2)
-#define SEGCBLIST_KTHREAD_CB	BIT(3)
-#define SEGCBLIST_KTHREAD_GP	BIT(4)
-#define SEGCBLIST_OFFLOADED	BIT(5)
+#define SEGCBLIST_LOCKING	BIT(1)
+#define SEGCBLIST_KTHREAD_CB	BIT(2)
+#define SEGCBLIST_KTHREAD_GP	BIT(3)
+#define SEGCBLIST_OFFLOADED	BIT(4)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
-- 
cgit v1.2.3


From 91e43b9044a4f9b6976dc28b3f1446b733cc68de Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 30 May 2024 15:45:51 +0200
Subject: rcu/nocb: Remove SEGCBLIST_KTHREAD_CB

This state excerpt from the (de-)offloading state machine was used to
implement an ad-hoc kthread parking of rcuo kthreads. This code has
been removed and therefore the related state can be erased as well.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 5469c54cd778..1ef1bb54853d 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -186,9 +186,8 @@ struct rcu_cblist {
  */
 #define SEGCBLIST_ENABLED	BIT(0)
 #define SEGCBLIST_LOCKING	BIT(1)
-#define SEGCBLIST_KTHREAD_CB	BIT(2)
-#define SEGCBLIST_KTHREAD_GP	BIT(3)
-#define SEGCBLIST_OFFLOADED	BIT(4)
+#define SEGCBLIST_KTHREAD_GP	BIT(2)
+#define SEGCBLIST_OFFLOADED	BIT(3)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
-- 
cgit v1.2.3


From 59c34008d3bdeef4c8ebc0ed2426109b474334d4 Mon Sep 17 00:00:00 2001
From: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Date: Mon, 22 Jul 2024 14:58:01 +0530
Subject: x86/amd_nb: Add new PCI IDs for AMD family 1Ah model 60h

Add new PCI device IDs into the root IDs and miscellaneous IDs lists to
provide support for the latest generation of AMD 1Ah family 60h processor
models.

Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Link: https://lore.kernel.org/r/20240722092801.3480266-1-Shyam-sundar.S-k@amd.com
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e388c8b1cbc2..91182aa1d2ec 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -580,6 +580,7 @@
 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
+#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3 0x124b
 #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3 0x12bb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
 #define PCI_DEVICE_ID_AMD_MI300_DF_F3	0x152b
-- 
cgit v1.2.3


From 52c3fb1a0f822fd64529ca64f3792095524de450 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 18 Jul 2024 10:21:49 +0200
Subject: perf/x86: Add hw_perf_event::aux_config

Start a new section for AUX PMUs in hw_perf_event.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/perf_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a8942277dda..6bb0c21d6335 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -168,6 +168,9 @@ struct hw_perf_event {
 			struct hw_perf_event_extra extra_reg;
 			struct hw_perf_event_extra branch_reg;
 		};
+		struct { /* aux / Intel-PT */
+			u64		aux_config;
+		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
 		};
-- 
cgit v1.2.3


From f23c042ce34ba265cf3129d530702b5d218e3f4b Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Mon, 27 May 2024 14:06:47 +0200
Subject: sched/deadline: Comment sched_dl_entity::dl_server variable

Add an explanation for the newly added variable.

Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers")
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/147f7aa8cb8fd925f36aa8059af6a35aad08b45a.1716811044.git.bristot@kernel.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..1c771ea4481d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -639,6 +639,8 @@ struct sched_dl_entity {
 	 *
 	 * @dl_overrun tells if the task asked to be informed about runtime
 	 * overruns.
+	 *
+	 * @dl_server tells if this is a server entity.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_yielded        : 1;
-- 
cgit v1.2.3


From a110a81c52a9de73e2e57e826dd3bf0fd4c22226 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Mon, 27 May 2024 14:06:51 +0200
Subject: sched/deadline: Deferrable dl server

Among the motivations for the DL servers is the real-time throttling
mechanism. This mechanism works by throttling the rt_rq after
running for a long period without leaving space for fair tasks.

The base dl server avoids this problem by boosting fair tasks instead
of throttling the rt_rq. The point is that it boosts without waiting
for potential starvation, causing some non-intuitive cases.

For example, an IRQ dispatches two tasks on an idle system, a fair
and an RT. The DL server will be activated, running the fair task
before the RT one. This problem can be avoided by deferring the
dl server activation.

By setting the defer option, the dl_server will dispatch an
SCHED_DEADLINE reservation with replenished runtime, but throttled.

The dl_timer will be set for the defer time at (period - runtime) ns
from start time. Thus boosting the fair rq at defer time.

If the fair scheduler has the opportunity to run while waiting
for defer time, the dl server runtime will be consumed. If
the runtime is completely consumed before the defer time, the
server will be replenished while still in a throttled state. Then,
the dl_timer will be reset to the new defer time

If the fair server reaches the defer time without consuming
its runtime, the server will start running, following CBS rules
(thus without breaking SCHED_DEADLINE). Then the server will
continue the running state (without deferring) until it fair
tasks are able to execute as regular fair scheduler (end of
the starvation).

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/dd175943c72533cd9f0b87767c6499204879cc38.1716811044.git.bristot@kernel.org
---
 include/linux/sched.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c771ea4481d..4edd7e2096fb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -641,12 +641,24 @@ struct sched_dl_entity {
 	 * overruns.
 	 *
 	 * @dl_server tells if this is a server entity.
+	 *
+	 * @dl_defer tells if this is a deferred or regular server. For
+	 * now only defer server exists.
+	 *
+	 * @dl_defer_armed tells if the deferrable server is waiting
+	 * for the replenishment timer to activate it.
+	 *
+	 * @dl_defer_running tells if the deferrable server is actually
+	 * running, skipping the defer phase.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
 	unsigned int			dl_server         : 1;
+	unsigned int			dl_defer	  : 1;
+	unsigned int			dl_defer_armed	  : 1;
+	unsigned int			dl_defer_running  : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
-- 
cgit v1.2.3


From c8a85394cfdb4696b4e2f8a0f3066a1c921af426 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 27 May 2024 14:06:54 +0200
Subject: sched/core: Fix picking of tasks for core scheduling with DL server

* Use simple CFS pick_task for DL pick_task

  DL server's pick_task calls CFS's pick_next_task_fair(), this is wrong
  because core scheduling's pick_task only calls CFS's pick_task() for
  evaluation / checking of the CFS task (comparing across CPUs), not for
  actually affirmatively picking the next task. This causes RB tree
  corruption issues in CFS that were found by syzbot.

* Make pick_task_fair clear DL server

  A DL task pick might set ->dl_server, but it is possible the task will
  never run (say the other HT has a stop task). If the CFS task is picked
  in the future directly (say without DL server), ->dl_server will be
  set. So clear it in pick_task_fair().

This fixes the KASAN issue reported by syzbot in set_next_entity().

(DL refactoring suggestions by Vineeth Pillai).

Reported-by: Suleiman Souhlal <suleiman@google.com>
Signed-off-by: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vineeth Pillai <vineeth@bitbyteword.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/b10489ab1f03d23e08e6097acea47442e7d6466f.1716811044.git.bristot@kernel.org
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4edd7e2096fb..2c1b4ee3234f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -686,7 +686,8 @@ struct sched_dl_entity {
 	 */
 	struct rq			*rq;
 	dl_server_has_tasks_f		server_has_tasks;
-	dl_server_pick_f		server_pick;
+	dl_server_pick_f		server_pick_next;
+	dl_server_pick_f		server_pick_task;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
-- 
cgit v1.2.3


From 5297bba507dc54045e6efeb9955c1271ca9aafe1 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut+renesas@mailbox.org>
Date: Tue, 23 Jul 2024 15:27:01 +0200
Subject: genirq/msi: Silence 'set affinity failed' warning

Various PCI controllers that mux MSIs onto a single IRQ line produce these
"IRQ%d: set affinity failed" warnings when entering suspend. This has been
discussed before [1] [2] and an example test case is included at the end of
this commit message.

Controller drivers that create MSI IRQ domain with
MSI_FLAG_USE_DEF_CHIP_OPS and do not override the .irq_set_affinity()
irqchip callback get assigned the default msi_domain_set_affinity()
callback. That is not desired on controllers where it is not possible to
set affinity of each MSI IRQ line to a specific CPU core due to hardware
limitation.

Introduce flag MSI_FLAG_NO_AFFINITY, which keeps .irq_set_affinity() unset
if the controller driver did not assign it.  This way, migrate_one_irq()
can exit right away, without printing the warning. The .irq_set_affinity()
implementations which only return -EINVAL can be removed from multiple
controller drivers.

  $ grep 25 /proc/interrupts
   25:   0 0 0 0 0 0 0 0   PCIe MSI   0   Edge   PCIe PME

  $ echo core > /sys/power/pm_test ; echo mem > /sys/power/state
  ...
  Disabling non-boot CPUs ...
  IRQ25: set affinity failed(-22). <---------- This is being silenced here
  psci: CPU7 killed (polled 4 ms)
  ...

[1] https://lore.kernel.org/all/d4a6eea3c5e33a3a4056885419df95a7@kernel.org/
[2] https://lore.kernel.org/all/5f4947b18bf381615a37aa81c2242477@kernel.org/

Link: https://lore.kernel.org/r/20240723132958.41320-2-marek.vasut+renesas@mailbox.org
Signed-off-by: Marek Vasut <marek.vasut+renesas@mailbox.org>
[bhelgaas: commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/msi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 944979763825..b10093c4d00e 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -554,6 +554,8 @@ enum {
 	MSI_FLAG_MSIX_CONTIGUOUS	= (1 << 19),
 	/* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */
 	MSI_FLAG_PCI_MSIX_ALLOC_DYN	= (1 << 20),
+	/* PCI MSIs cannot be steered separately to CPU cores */
+	MSI_FLAG_NO_AFFINITY		= (1 << 21),
 };
 
 /**
-- 
cgit v1.2.3


From de79583ffe794663c53b77f97be814522d4edc4f Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Tue, 2 Jul 2024 18:02:33 +0200
Subject: iio: core: add accessors 'masklength'

'masklength' is supposed to be an IIO private member. However, drivers
(often in trigger handlers) need to access it to iterate over the
enabled channels for example (there are other reasons). Hence, a couple
of new accessors are being added:

 * iio_for_each_active_channel() - Iterates over the active channels;
 * iio_get_masklength() - Get length of the channels mask.

The goal of these new accessors is to annotate 'masklength' as private
as soon as all drivers accessing it are converted to use the new
helpers.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Reviewed-by: Alexandru Ardelean <aardelean@baylibre.com>
Link: https://patch.msgid.link/20240702-dev-iio-masklength-private-v1-1-98193bf536a6@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 894309294182..dd6bbc468283 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -855,6 +855,24 @@ static inline const struct iio_scan_type
 	return &chan->scan_type;
 }
 
+/**
+ * iio_get_masklength - Get length of the channels mask
+ * @indio_dev: the IIO device to get the masklength for
+ */
+static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev)
+{
+	return indio_dev->masklength;
+}
+
+/**
+ * iio_for_each_active_channel - Iterated over active channels
+ * @indio_dev: the IIO device
+ * @chan: Holds the index of the enabled channel
+ */
+#define iio_for_each_active_channel(indio_dev, chan) \
+	for_each_set_bit((chan), (indio_dev)->active_scan_mask, \
+			 iio_get_masklength(indio_dev))
+
 ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals);
 
 int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer,
-- 
cgit v1.2.3


From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Thu, 18 Jul 2024 13:23:53 -0700
Subject: bpf: Track equal scalars history on per-instruction level

Use bpf_verifier_state->jmp_history to track which registers were
updated by find_equal_scalars() (renamed to collect_linked_regs())
when conditional jump was verified. Use recorded information in
backtrack_insn() to propagate precision.

E.g. for the following program:

            while verifying instructions
  1: r1 = r0              |
  2: if r1 < 8  goto ...  | push r0,r1 as linked registers in jmp_history
  3: if r0 > 16 goto ...  | push r0,r1 as linked registers in jmp_history
  4: r2 = r10             |
  5: r2 += r0             v mark_chain_precision(r0)

            while doing mark_chain_precision(r0)
  5: r2 += r0             | mark r0 precise
  4: r2 = r10             |
  3: if r0 > 16 goto ...  | mark r0,r1 as precise
  2: if r1 < 8  goto ...  | mark r0,r1 as precise
  1: r1 = r0              v

Technically, do this as follows:
- Use 10 bits to identify each register that gains range because of
  sync_linked_regs():
  - 3 bits for frame number;
  - 6 bits for register or stack slot number;
  - 1 bit to indicate if register is spilled.
- Use u64 as a vector of 6 such records + 4 bits for vector length.
- Augment struct bpf_jmp_history_entry with a field 'linked_regs'
  representing such vector.
- When doing check_cond_jmp_op() remember up to 6 registers that
  gain range because of sync_linked_regs() in such a vector.
- Don't propagate range information and reset IDs for registers that
  don't fit in 6-value vector.
- Push a pair {instruction index, linked registers vector}
  to bpf_verifier_state->jmp_history.
- When doing backtrack_insn() check if any of recorded linked
  registers is currently marked precise, if so mark all linked
  registers as precise.

This also requires fixes for two test_verifier tests:
- precise: test 1
- precise: test 2

Both tests contain the following instruction sequence:

19: (bf) r2 = r9                      ; R2=scalar(id=3) R9=scalar(id=3)
20: (a5) if r2 < 0x8 goto pc+1        ; R2=scalar(id=3,umin=8)
21: (95) exit
22: (07) r2 += 1                      ; R2_w=scalar(id=3+1,...)
23: (bf) r1 = r10                     ; R1_w=fp0 R10=fp0
24: (07) r1 += -8                     ; R1_w=fp-8
25: (b7) r3 = 0                       ; R3_w=0
26: (85) call bpf_probe_read_kernel#113

The call to bpf_probe_read_kernel() at (26) forces r2 to be precise.
Previously, this forced all registers with same id to become precise
immediately when mark_chain_precision() is called.
After this change, the precision is propagated to registers sharing
same id only when 'if' instruction is backtracked.
Hence verification log for both tests is changed:
regs=r2,r9 -> regs=r2 for instructions 25..20.

Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()")
Reported-by: Hao Sun <sunhao.th@gmail.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com

Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/
---
 include/linux/bpf_verifier.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6503c85b10a3..731a0a4ac13c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -371,6 +371,10 @@ struct bpf_jmp_history_entry {
 	u32 prev_idx : 22;
 	/* special flags, e.g., whether insn is doing register stack spill/load */
 	u32 flags : 10;
+	/* additional registers that need precision tracking when this
+	 * jump is backtracked, vector of six 10-bit records
+	 */
+	u64 linked_regs;
 };
 
 /* Maximum number of register states that can exist at once */
-- 
cgit v1.2.3


From e42ac14180554fa23a3312d4f921dc4ea7972fb7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Mon, 22 Jul 2024 11:30:45 -0700
Subject: bpf: Check unsupported ops from the bpf_struct_ops's cfi_stubs

The bpf_tcp_ca struct_ops currently uses a "u32 unsupported_ops[]"
array to track which ops is not supported.

After cfi_stubs had been added, the function pointer in cfi_stubs is
also NULL for the unsupported ops. Thus, the "u32 unsupported_ops[]"
becomes redundant. This observation was originally brought up in the
bpf/cfi discussion:
https://lore.kernel.org/bpf/CAADnVQJoEkdjyCEJRPASjBw1QGsKYrF33QdMGc1RZa9b88bAEA@mail.gmail.com/

The recent bpf qdisc patch (https://lore.kernel.org/bpf/20240714175130.4051012-6-amery.hung@bytedance.com/)
also needs to specify quite many unsupported ops. It is a good time
to clean it up.

This patch removes the need of "u32 unsupported_ops[]" and tests for null-ness
in the cfi_stubs instead.

Testing the cfi_stubs is done in a new function bpf_struct_ops_supported().
The verifier will call bpf_struct_ops_supported() when loading the
struct_ops program. The ".check_member" is removed from the bpf_tcp_ca
in this patch. ".check_member" could still be useful for other subsytems
to enforce other restrictions (e.g. sched_ext checks for prog->sleepable).

To keep the same error return, ENOTSUPP is used.

Cc: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240722183049.2254692-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3b94ec161e8c..4c54864316ee 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1795,6 +1795,7 @@ struct bpf_struct_ops_common_value {
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff);
 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 				       void *value);
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
@@ -1851,6 +1852,10 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 {
 	module_put(owner);
 }
+static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+	return -ENOTSUPP;
+}
 static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 						     void *key,
 						     void *value)
-- 
cgit v1.2.3


From 52dea0a15cc888e89f0144dca7b712fb56d12a28 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Jun 2024 18:42:28 +0200
Subject: posix-timers: Convert timer list to hlist

No requirement for a real list. Spare a few bytes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/posix-timers.h | 2 +-
 include/linux/sched/signal.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index dc7b738de299..453691710839 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -158,7 +158,7 @@ static inline void posix_cputimers_init_work(void) { }
  * @rcu:		RCU head for freeing the timer.
  */
 struct k_itimer {
-	struct list_head	list;
+	struct hlist_node	list;
 	struct hlist_node	t_hash;
 	spinlock_t		it_lock;
 	const struct k_clock	*kclock;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0a0e23c45406..622fdef78e14 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -137,7 +137,7 @@ struct signal_struct {
 
 	/* POSIX.1b Interval Timers */
 	unsigned int		next_posix_timer_id;
-	struct list_head	posix_timers;
+	struct hlist_head	posix_timers;
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
-- 
cgit v1.2.3


From a2b80ce87a87fc18c594e74d13031d5e347b69cb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Jun 2024 18:42:33 +0200
Subject: signal: Remove task argument from dequeue_signal()

The task pointer which is handed to dequeue_signal() is always current. The
argument along with the first comment about signalfd in that function is
confusing at best. Remove it and use current internally.

Update the stale comment for dequeue_signal() while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/signal.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 622fdef78e14..c8ed09ac29ac 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -276,8 +276,7 @@ static inline void signal_set_stop_flags(struct signal_struct *sig,
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *task, sigset_t *mask,
-			  kernel_siginfo_t *info, enum pid_type *type);
+extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);
 
 static inline int kernel_dequeue_signal(void)
 {
@@ -287,7 +286,7 @@ static inline int kernel_dequeue_signal(void)
 	int ret;
 
 	spin_lock_irq(&task->sighand->siglock);
-	ret = dequeue_signal(task, &task->blocked, &__info, &__type);
+	ret = dequeue_signal(&task->blocked, &__info, &__type);
 	spin_unlock_irq(&task->sighand->siglock);
 
 	return ret;
-- 
cgit v1.2.3


From 5d99e198be279045e6ecefe220f5c52f8ce9bfd5 Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Fri, 19 Jul 2024 19:00:52 +0800
Subject: bpf, lsm: Add check for BPF LSM return value

A bpf prog returning a positive number attached to file_alloc_security
hook makes kernel panic.

This happens because file system can not filter out the positive number
returned by the LSM prog using IS_ERR, and misinterprets this positive
number as a file pointer.

Given that hook file_alloc_security never returned positive number
before the introduction of BPF LSM, and other BPF LSM hooks may
encounter similar issues, this patch adds LSM return value check
in verifier, to ensure no unexpected value is returned.

Fixes: 520b7aa00d8c ("bpf: lsm: Initialize the BPF LSM hooks")
Reported-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240719110059.797546-3-xukuohai@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h     | 1 +
 include/linux/bpf_lsm.h | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4c54864316ee..5739cc9986f8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -927,6 +927,7 @@ struct bpf_insn_access_aux {
 		};
 	};
 	struct bpf_verifier_log *log; /* for verbose logs */
+	bool is_retval; /* is accessing function return value ? */
 };
 
 static inline void
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 1de7ece5d36d..aefcd6564251 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -9,6 +9,7 @@
 
 #include <linux/sched.h>
 #include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
 #include <linux/lsm_hooks.h>
 
 #ifdef CONFIG_BPF_LSM
@@ -45,6 +46,8 @@ void bpf_inode_storage_free(struct inode *inode);
 
 void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);
 
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+			     struct bpf_retval_range *range);
 #else /* !CONFIG_BPF_LSM */
 
 static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
@@ -78,6 +81,11 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
 {
 }
 
+static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+					   struct bpf_retval_range *range)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
-- 
cgit v1.2.3


From 28ead3eaabc16ecc907cfb71876da028080f6356 Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Fri, 19 Jul 2024 19:00:53 +0800
Subject: bpf: Prevent tail call between progs attached to different hooks

bpf progs can be attached to kernel functions, and the attached functions
can take different parameters or return different return values. If
prog attached to one kernel function tail calls prog attached to another
kernel function, the ctx access or return value verification could be
bypassed.

For example, if prog1 is attached to func1 which takes only 1 parameter
and prog2 is attached to func2 which takes two parameters. Since verifier
assumes the bpf ctx passed to prog2 is constructed based on func2's
prototype, verifier allows prog2 to access the second parameter from
the bpf ctx passed to it. The problem is that verifier does not prevent
prog1 from passing its bpf ctx to prog2 via tail call. In this case,
the bpf ctx passed to prog2 is constructed from func1 instead of func2,
that is, the assumption for ctx access verification is bypassed.

Another example, if BPF LSM prog1 is attached to hook file_alloc_security,
and BPF LSM prog2 is attached to hook bpf_lsm_audit_rule_known. Verifier
knows the return value rules for these two hooks, e.g. it is legal for
bpf_lsm_audit_rule_known to return positive number 1, and it is illegal
for file_alloc_security to return positive number. So verifier allows
prog2 to return positive number 1, but does not allow prog1 to return
positive number. The problem is that verifier does not prevent prog1
from calling prog2 via tail call. In this case, prog2's return value 1
will be used as the return value for prog1's hook file_alloc_security.
That is, the return value rule is bypassed.

This patch adds restriction for tail call to prevent such bypasses.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Link: https://lore.kernel.org/r/20240719110059.797546-4-xukuohai@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5739cc9986f8..f16d0753f518 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -294,6 +294,7 @@ struct bpf_map {
 	 * same prog type, JITed flag and xdp_has_frags flag.
 	 */
 	struct {
+		const struct btf_type *attach_func_proto;
 		spinlock_t lock;
 		enum bpf_prog_type type;
 		bool jited;
-- 
cgit v1.2.3


From 2aff9d20d50ac45dd13a013ef5231f4fb8912356 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 10 Jul 2024 14:32:25 -0700
Subject: lsm: infrastructure management of the sock security

Move management of the sock->sk_security blob out
of the individual security modules and into the security
infrastructure. Instead of allocating the blobs from within
the modules the modules tell the infrastructure how much
space is required, and the space is allocated there.

Acked-by: Paul Moore <paul@paul-moore.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Acked-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
[PM: subject tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a2ade0ffe9e7..efd4a0655159 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -73,6 +73,7 @@ struct lsm_blob_sizes {
 	int	lbs_cred;
 	int	lbs_file;
 	int	lbs_inode;
+	int	lbs_sock;
 	int	lbs_superblock;
 	int	lbs_ipc;
 	int	lbs_msg_msg;
-- 
cgit v1.2.3


From 5f8d28f6d7d568dbbc8c5bce94894474c07afd4f Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 10 Jul 2024 14:32:26 -0700
Subject: lsm: infrastructure management of the key security blob

Move management of the key->security blob out of the individual security
modules and into the security infrastructure. Instead of allocating the
blobs from within the modules the modules tell the infrastructure how
much space is required, and the space is allocated there.  There are
no existing modules that require a key_free hook, so the call to it and
the definition for it have been removed.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
[PM: subject tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 -
 include/linux/lsm_hooks.h     | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 855db460e08b..8cc60644f3bd 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -403,7 +403,6 @@ LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid,
 #ifdef CONFIG_KEYS
 LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred,
 	 unsigned long flags)
-LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key)
 LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred,
 	 enum key_need_perm need_perm)
 LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index efd4a0655159..7233bc0737be 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -76,6 +76,7 @@ struct lsm_blob_sizes {
 	int	lbs_sock;
 	int	lbs_superblock;
 	int	lbs_ipc;
+	int	lbs_key;
 	int	lbs_msg_msg;
 	int	lbs_task;
 	int	lbs_xattr_count; /* number of xattr slots in new_xattrs array */
-- 
cgit v1.2.3


From a39c0f77dbbe083f3eec6c3b32d90f168f7575eb Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 10 Jul 2024 14:32:28 -0700
Subject: lsm: infrastructure management of the dev_tun blob

Move management of the dev_tun security blob out of the individual
security modules and into the LSM infrastructure.  The security modules
tell the infrastructure how much space they require at initialization.
There are no longer any modules that require the dev_tun_free hook.
The hook definition has been removed.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
[PM: subject tweak, selinux style fixes]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 3 +--
 include/linux/lsm_hooks.h     | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 8cc60644f3bd..22c475f530ae 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -353,8 +353,7 @@ LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void)
 LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void)
 LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req,
 	 struct flowi_common *flic)
-LSM_HOOK(int, 0, tun_dev_alloc_security, void **security)
-LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security)
+LSM_HOOK(int, 0, tun_dev_alloc_security, void *security)
 LSM_HOOK(int, 0, tun_dev_create, void)
 LSM_HOOK(int, 0, tun_dev_attach_queue, void *security)
 LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7233bc0737be..0ff14ff128c8 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -80,6 +80,7 @@ struct lsm_blob_sizes {
 	int	lbs_msg_msg;
 	int	lbs_task;
 	int	lbs_xattr_count; /* number of xattr slots in new_xattrs array */
+	int	lbs_tun_dev;
 };
 
 /**
-- 
cgit v1.2.3


From 66de33a0bbb59ef3909d2c65dbbb7fc503d573bd Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 10 Jul 2024 14:32:29 -0700
Subject: lsm: infrastructure management of the infiniband blob

Move management of the infiniband security blob out of the individual
security modules and into the LSM infrastructure.  The security modules
tell the infrastructure how much space they require at initialization.
There are no longer any modules that require the ib_free() hook.
The hook definition has been removed.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
[PM: subject tweak, selinux style fixes]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 3 +--
 include/linux/lsm_hooks.h     | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 22c475f530ae..4fcbc5b3f58c 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -373,8 +373,7 @@ LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk)
 LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey)
 LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name,
 	 u8 port_num)
-LSM_HOOK(int, 0, ib_alloc_security, void **sec)
-LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec)
+LSM_HOOK(int, 0, ib_alloc_security, void *sec)
 #endif /* CONFIG_SECURITY_INFINIBAND */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 0ff14ff128c8..b6fc6ac88723 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -72,6 +72,7 @@ struct security_hook_list {
 struct lsm_blob_sizes {
 	int	lbs_cred;
 	int	lbs_file;
+	int	lbs_ib;
 	int	lbs_inode;
 	int	lbs_sock;
 	int	lbs_superblock;
-- 
cgit v1.2.3


From 61a1dcdceb44d79e5ab511295791b88ea178c045 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Wed, 10 Jul 2024 14:32:30 -0700
Subject: lsm: infrastructure management of the perf_event security blob

Move management of the perf_event->security blob out of the individual
security modules and into the security infrastructure. Instead of
allocating the blobs from within the modules the modules tell the
infrastructure how much space is required, and the space is allocated
there.  There are no longer any modules that require the perf_event_free()
hook.  The hook definition has been removed.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
[PM: subject tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 -
 include/linux/lsm_hooks.h     | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 4fcbc5b3f58c..12c81be78eb9 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -439,7 +439,6 @@ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
 #ifdef CONFIG_PERF_EVENTS
 LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type)
 LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event)
-LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event)
 LSM_HOOK(int, 0, perf_event_read, struct perf_event *event)
 LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
 #endif /* CONFIG_PERF_EVENTS */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index b6fc6ac88723..f1ca8082075a 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -79,6 +79,7 @@ struct lsm_blob_sizes {
 	int	lbs_ipc;
 	int	lbs_key;
 	int	lbs_msg_msg;
+	int	lbs_perf_event;
 	int	lbs_task;
 	int	lbs_xattr_count; /* number of xattr slots in new_xattrs array */
 	int	lbs_tun_dev;
-- 
cgit v1.2.3


From 92de36080c93296ef9005690705cba260b9bd68a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 23 Jul 2024 08:34:39 -0700
Subject: bpf: Fail verification for sign-extension of packet
 data/data_end/data_meta

syzbot reported a kernel crash due to
  commit 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses").
The reason is due to sign-extension of 32-bit load for
packet data/data_end/data_meta uapi field.

The original code looks like:
        r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */
        r3 = *(u32 *)(r1 + 80) /* load __sk_buff->data_end */
        r0 = r2
        r0 += 8
        if r3 > r0 goto +1
        ...
Note that __sk_buff->data load has 32-bit sign extension.

After verification and convert_ctx_accesses(), the final asm code looks like:
        r2 = *(u64 *)(r1 +208)
        r2 = (s32)r2
        r3 = *(u64 *)(r1 +80)
        r0 = r2
        r0 += 8
        if r3 > r0 goto pc+1
        ...
Note that 'r2 = (s32)r2' may make the kernel __sk_buff->data address invalid
which may cause runtime failure.

Currently, in C code, typically we have
        void *data = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        ...
and it will generate
        r2 = *(u64 *)(r1 +208)
        r3 = *(u64 *)(r1 +80)
        r0 = r2
        r0 += 8
        if r3 > r0 goto pc+1

If we allow sign-extension,
        void *data = (void *)(long)(int)skb->data;
        void *data_end = (void *)(long)skb->data_end;
        ...
the generated code looks like
        r2 = *(u64 *)(r1 +208)
        r2 <<= 32
        r2 s>>= 32
        r3 = *(u64 *)(r1 +80)
        r0 = r2
        r0 += 8
        if r3 > r0 goto pc+1
and this will cause verification failure since "r2 <<= 32" is not allowed
as "r2" is a packet pointer.

To fix this issue for case
  r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */
this patch added additional checking in is_valid_access() callback
function for packet data/data_end/data_meta access. If those accesses
are with sign-extenstion, the verification will fail.

  [1] https://lore.kernel.org/bpf/000000000000c90eee061d236d37@google.com/

Reported-by: syzbot+ad9ec60c8eaf69e6f99c@syzkaller.appspotmail.com
Fixes: 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses")
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240723153439.2429035-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f16d0753f518..f560ea0c2b36 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -920,6 +920,7 @@ static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
  */
 struct bpf_insn_access_aux {
 	enum bpf_reg_type reg_type;
+	bool is_ldsx;
 	union {
 		int ctx_field_size;
 		struct {
-- 
cgit v1.2.3


From 5b5f51bff1b66cedb62b5ba74a1878341204e057 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Mon, 22 Jul 2024 16:38:36 -0700
Subject: bpf: no_caller_saved_registers attribute for helper calls

GCC and LLVM define a no_caller_saved_registers function attribute.
This attribute means that function scratches only some of
the caller saved registers defined by ABI.
For BPF the set of such registers could be defined as follows:
- R0 is scratched only if function is non-void;
- R1-R5 are scratched only if corresponding parameter type is defined
  in the function prototype.

This commit introduces flag bpf_func_prot->allow_nocsr.
If this flag is set for some helper function, verifier assumes that
it follows no_caller_saved_registers calling convention.

The contract between kernel and clang allows to simultaneously use
such functions and maintain backwards compatibility with old
kernels that don't understand no_caller_saved_registers calls
(nocsr for short):

- clang generates a simple pattern for nocsr calls, e.g.:

    r1 = 1;
    r2 = 2;
    *(u64 *)(r10 - 8)  = r1;
    *(u64 *)(r10 - 16) = r2;
    call %[to_be_inlined]
    r2 = *(u64 *)(r10 - 16);
    r1 = *(u64 *)(r10 - 8);
    r0 = r1;
    r0 += r2;
    exit;

- kernel removes unnecessary spills and fills, if called function is
  inlined by verifier or current JIT (with assumption that patch
  inserted by verifier or JIT honors nocsr contract, e.g. does not
  scratch r3-r5 for the example above), e.g. the code above would be
  transformed to:

    r1 = 1;
    r2 = 2;
    call %[to_be_inlined]
    r0 = r1;
    r0 += r2;
    exit;

Technically, the transformation is split into the following phases:
- function mark_nocsr_patterns(), called from bpf_check()
  searches and marks potential patterns in instruction auxiliary data;
- upon stack read or write access,
  function check_nocsr_stack_contract() is used to verify if
  stack offsets, presumably reserved for nocsr patterns, are used
  only from those patterns;
- function remove_nocsr_spills_fills(), called from bpf_check(),
  applies the rewrite for valid patterns.

See comment in mark_nocsr_pattern_for_call() for more details.

Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240722233844.1406874-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h          |  6 ++++++
 include/linux/bpf_verifier.h | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f560ea0c2b36..b9425e410bcb 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -808,6 +808,12 @@ struct bpf_func_proto {
 	bool gpl_only;
 	bool pkt_access;
 	bool might_sleep;
+	/* set to true if helper follows contract for gcc/llvm
+	 * attribute no_caller_saved_registers:
+	 * - void functions do not scratch r0
+	 * - functions taking N arguments scratch only registers r1-rN
+	 */
+	bool allow_nocsr;
 	enum bpf_return_type ret_type;
 	union {
 		struct {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 731a0a4ac13c..5cea15c81b8a 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -576,6 +576,14 @@ struct bpf_insn_aux_data {
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
+	/* true if STX or LDX instruction is a part of a spill/fill
+	 * pattern for a no_caller_saved_registers call.
+	 */
+	u8 nocsr_pattern:1;
+	/* for CALL instructions, a number of spill/fill pairs in the
+	 * no_caller_saved_registers pattern.
+	 */
+	u8 nocsr_spills_num:3;
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
@@ -645,6 +653,10 @@ struct bpf_subprog_info {
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
 	u16 stack_extra;
+	/* offsets in range [stack_depth .. nocsr_stack_off)
+	 * are used for no_caller_saved_registers spills and fills.
+	 */
+	s16 nocsr_stack_off;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
@@ -652,6 +664,8 @@ struct bpf_subprog_info {
 	bool is_async_cb: 1;
 	bool is_exception_cb: 1;
 	bool args_cached: 1;
+	/* true if nocsr stack region is used by functions that can't be inlined */
+	bool keep_nocsr_stack: 1;
 
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
-- 
cgit v1.2.3


From 7ebd8c5acad5f8ca41f37b36dc62570e1fa13d8b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 18 Jul 2024 16:59:06 +0900
Subject: ata: libata: Use QUIRK instead of HORKAGE

According to Wiktionary, the verb "hork" is computing slang defined as
"To foul up; to be occupied with difficulty, tangle, or unpleasantness;
to be broken" (https://en.wiktionary.org/wiki/hork#Verb). libata uses
this with the term "horkage" to refer to broken device features. Given
that this term is not widely used and its meaning unknown to many,
rename it to the more commonly used term "quirk", similar to many other
places in the kernel.

The renaming done is:
1) Rename all ATA_HORKAGE_XXX flags to ATA_QUIRK_XXX
2) Rename struct ata_device horkage field to quirks
3) Rename struct ata_blacklist_entry to struct ata_dev_quirks_entry. The
   array of these structures defining quirks for known devices is
   renamed __ata_dev_quirks.
4) The functions ata_dev_blacklisted() and ata_force_horkage() are
   renamed to ata_dev_quirks() and ata_force_quirks() respectively.
5) All the force_horkage_xxx() macros are renamed to force_quirk_xxx()

And while at it, make sure that the type "unsigned int" is used
consistantly for quirk flags variables and data structure fields.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Igor Pylypiv <ipylypiv@google.com>
---
 include/linux/libata.h | 71 +++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 17394098bee9..05dd7038ab30 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -362,40 +362,41 @@ enum {
 	 */
 	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8,
 
-	/* Horkage types. May be set by libata or controller on drives
-	   (some horkage may be drive/controller pair dependent */
-
-	ATA_HORKAGE_DIAGNOSTIC	= (1 << 0),	/* Failed boot diag */
-	ATA_HORKAGE_NODMA	= (1 << 1),	/* DMA problems */
-	ATA_HORKAGE_NONCQ	= (1 << 2),	/* Don't use NCQ */
-	ATA_HORKAGE_MAX_SEC_128	= (1 << 3),	/* Limit max sects to 128 */
-	ATA_HORKAGE_BROKEN_HPA	= (1 << 4),	/* Broken HPA */
-	ATA_HORKAGE_DISABLE	= (1 << 5),	/* Disable it */
-	ATA_HORKAGE_HPA_SIZE	= (1 << 6),	/* native size off by one */
-	ATA_HORKAGE_IVB		= (1 << 8),	/* cbl det validity bit bugs */
-	ATA_HORKAGE_STUCK_ERR	= (1 << 9),	/* stuck ERR on next PACKET */
-	ATA_HORKAGE_BRIDGE_OK	= (1 << 10),	/* no bridge limits */
-	ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands
-						    not multiple of 16 bytes */
-	ATA_HORKAGE_FIRMWARE_WARN = (1 << 12),	/* firmware update warning */
-	ATA_HORKAGE_1_5_GBPS	= (1 << 13),	/* force 1.5 Gbps */
-	ATA_HORKAGE_NOSETXFER	= (1 << 14),	/* skip SETXFER, SATA only */
-	ATA_HORKAGE_BROKEN_FPDMA_AA	= (1 << 15),	/* skip AA */
-	ATA_HORKAGE_DUMP_ID	= (1 << 16),	/* dump IDENTIFY data */
-	ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17),	/* Set max sects to 65535 */
-	ATA_HORKAGE_ATAPI_DMADIR = (1 << 18),	/* device requires dmadir */
-	ATA_HORKAGE_NO_NCQ_TRIM	= (1 << 19),	/* don't use queued TRIM */
-	ATA_HORKAGE_NOLPM	= (1 << 20),	/* don't use LPM */
-	ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),	/* some WDs have broken LPM */
-	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
-	ATA_HORKAGE_NO_DMA_LOG	= (1 << 23),	/* don't use DMA for log read */
-	ATA_HORKAGE_NOTRIM	= (1 << 24),	/* don't use TRIM */
-	ATA_HORKAGE_MAX_SEC_1024 = (1 << 25),	/* Limit max sects to 1024 */
-	ATA_HORKAGE_MAX_TRIM_128M = (1 << 26),	/* Limit max trim size to 128M */
-	ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27),	/* Disable NCQ on ATI chipset */
-	ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28),	/* Identify device log missing */
-	ATA_HORKAGE_NO_LOG_DIR	= (1 << 29),	/* Do not read log directory */
-	ATA_HORKAGE_NO_FUA	= (1 << 30),	/* Do not use FUA */
+	/*
+	 * Quirk flags: may be set by libata or controller drivers on drives.
+	 * Some quirks may be drive/controller pair dependent.
+	 */
+	ATA_QUIRK_DIAGNOSTIC	= (1 << 0),	/* Failed boot diag */
+	ATA_QUIRK_NODMA		= (1 << 1),	/* DMA problems */
+	ATA_QUIRK_NONCQ		= (1 << 2),	/* Do not use NCQ */
+	ATA_QUIRK_MAX_SEC_128	= (1 << 3),	/* Limit max sects to 128 */
+	ATA_QUIRK_BROKEN_HPA	= (1 << 4),	/* Broken HPA */
+	ATA_QUIRK_DISABLE	= (1 << 5),	/* Disable it */
+	ATA_QUIRK_HPA_SIZE	= (1 << 6),	/* Native size off by one */
+	ATA_QUIRK_IVB		= (1 << 8),	/* CBL det validity bit bugs */
+	ATA_QUIRK_STUCK_ERR	= (1 << 9),	/* Stuck ERR on next PACKET */
+	ATA_QUIRK_BRIDGE_OK	= (1 << 10),	/* No bridge limits */
+	ATA_QUIRK_ATAPI_MOD16_DMA = (1 << 11),	/* Use ATAPI DMA for commands */
+						/* not multiple of 16 bytes */
+	ATA_QUIRK_FIRMWARE_WARN = (1 << 12),	/* Firmware update warning */
+	ATA_QUIRK_1_5_GBPS	= (1 << 13),	/* Force 1.5 Gbps */
+	ATA_QUIRK_NOSETXFER	= (1 << 14),	/* Skip SETXFER, SATA only */
+	ATA_QUIRK_BROKEN_FPDMA_AA = (1 << 15),	/* Skip AA */
+	ATA_QUIRK_DUMP_ID	= (1 << 16),	/* Dump IDENTIFY data */
+	ATA_QUIRK_MAX_SEC_LBA48 = (1 << 17),	/* Set max sects to 65535 */
+	ATA_QUIRK_ATAPI_DMADIR	= (1 << 18),	/* Device requires dmadir */
+	ATA_QUIRK_NO_NCQ_TRIM	= (1 << 19),	/* Do not use queued TRIM */
+	ATA_QUIRK_NOLPM		= (1 << 20),	/* Do not use LPM */
+	ATA_QUIRK_WD_BROKEN_LPM = (1 << 21),	/* Some WDs have broken LPM */
+	ATA_QUIRK_ZERO_AFTER_TRIM = (1 << 22),	/* Guarantees zero after trim */
+	ATA_QUIRK_NO_DMA_LOG	= (1 << 23),	/* Do not use DMA for log read */
+	ATA_QUIRK_NOTRIM	= (1 << 24),	/* Do not use TRIM */
+	ATA_QUIRK_MAX_SEC_1024	= (1 << 25),	/* Limit max sects to 1024 */
+	ATA_QUIRK_MAX_TRIM_128M = (1 << 26),	/* Limit max trim size to 128M */
+	ATA_QUIRK_NO_NCQ_ON_ATI = (1 << 27),	/* Disable NCQ on ATI chipset */
+	ATA_QUIRK_NO_ID_DEV_LOG = (1 << 28),	/* Identify device log missing */
+	ATA_QUIRK_NO_LOG_DIR	= (1 << 29),	/* Do not read log directory */
+	ATA_QUIRK_NO_FUA	= (1 << 30),	/* Do not use FUA */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
@@ -663,7 +664,7 @@ struct ata_cpr_log {
 struct ata_device {
 	struct ata_link		*link;
 	unsigned int		devno;		/* 0 or 1 */
-	unsigned int		horkage;	/* List of broken features */
+	unsigned int		quirks;		/* List of broken features */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
 	struct scsi_device	*sdev;		/* attached SCSI device */
 	void			*private_data;
-- 
cgit v1.2.3


From 58157d607aecb4e05ab793408038b014c84e466f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 18 Jul 2024 16:54:03 +0900
Subject: ata: libata: Print quirks applied to devices

Introduce the function ata_dev_print_quirks() to print the quirk flags
that will be applied to a scanned device. This new function is called
from ata_dev_quirks() when a match on a device model or device model
and revision is found for a device in the __ata_dev_quirks array.

To implement this function, the ATA_QUIRK_ flags are redefined using
the new enum ata_quirk which defines the bit shift for each quirk
flag. The array of strings ata_quirk_names is used to define the name
of each flag, which are printed by ata_dev_print_quirks().

Example output for a device listed in the __ata_dev_quirks array and
which has the ATA_QUIRK_DISABLE flag applied:

[10193.461270] ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300)
[10193.469190] ata1.00: Model 'ASMT109x- Config', rev '2143 5', applying quirks: disable
[10193.469195] ata1.00: unsupported device, disabling
[10193.481564] ata1.00: disable device

enum ata_quirk also defines the __ATA_QUIRK_MAX value as one plus the
last quirk flag defined. This value is used in ata_dev_quirks() to add a
build time check that all quirk flags fit within the unsigned int
(32-bits) quirks field of struct ata_device.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Igor Pylypiv <ipylypiv@google.com>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 106 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 05dd7038ab30..d598ef690e50 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -55,6 +55,46 @@
 /* defines only for the constants which don't work well as enums */
 #define ATA_TAG_POISON		0xfafbfcfdU
 
+/*
+ * Quirk flags bits.
+ * ata_device->quirks is an unsigned int, so __ATA_QUIRK_MAX must not exceed 32.
+ */
+enum ata_quirks {
+	__ATA_QUIRK_DIAGNOSTIC,		/* Failed boot diag */
+	__ATA_QUIRK_NODMA,		/* DMA problems */
+	__ATA_QUIRK_NONCQ,		/* Don't use NCQ */
+	__ATA_QUIRK_MAX_SEC_128,	/* Limit max sects to 128 */
+	__ATA_QUIRK_BROKEN_HPA,		/* Broken HPA */
+	__ATA_QUIRK_DISABLE,		/* Disable it */
+	__ATA_QUIRK_HPA_SIZE,		/* Native size off by one */
+	__ATA_QUIRK_IVB,		/* cbl det validity bit bugs */
+	__ATA_QUIRK_STUCK_ERR,		/* Stuck ERR on next PACKET */
+	__ATA_QUIRK_BRIDGE_OK,		/* No bridge limits */
+	__ATA_QUIRK_ATAPI_MOD16_DMA,	/* Use ATAPI DMA for commands that */
+					/* are not a multiple of 16 bytes */
+	__ATA_QUIRK_FIRMWARE_WARN,	/* Firmware update warning */
+	__ATA_QUIRK_1_5_GBPS,		/* Force 1.5 Gbps */
+	__ATA_QUIRK_NOSETXFER,		/* Skip SETXFER, SATA only */
+	__ATA_QUIRK_BROKEN_FPDMA_AA,	/* Skip AA */
+	__ATA_QUIRK_DUMP_ID,		/* Dump IDENTIFY data */
+	__ATA_QUIRK_MAX_SEC_LBA48,	/* Set max sects to 65535 */
+	__ATA_QUIRK_ATAPI_DMADIR,	/* Device requires dmadir */
+	__ATA_QUIRK_NO_NCQ_TRIM,	/* Do not use queued TRIM */
+	__ATA_QUIRK_NOLPM,		/* Do not use LPM */
+	__ATA_QUIRK_WD_BROKEN_LPM,	/* Some WDs have broken LPM */
+	__ATA_QUIRK_ZERO_AFTER_TRIM,	/* Guarantees zero after trim */
+	__ATA_QUIRK_NO_DMA_LOG,		/* Do not use DMA for log read */
+	__ATA_QUIRK_NOTRIM,		/* Do not use TRIM */
+	__ATA_QUIRK_MAX_SEC_1024,	/* Limit max sects to 1024 */
+	__ATA_QUIRK_MAX_TRIM_128M,	/* Limit max trim size to 128M */
+	__ATA_QUIRK_NO_NCQ_ON_ATI,	/* Disable NCQ on ATI chipset */
+	__ATA_QUIRK_NO_ID_DEV_LOG,	/* Identify device log missing */
+	__ATA_QUIRK_NO_LOG_DIR,		/* Do not read log directory */
+	__ATA_QUIRK_NO_FUA,		/* Do not use FUA */
+
+	__ATA_QUIRK_MAX,
+};
+
 enum {
 	/* various global constants */
 	LIBATA_MAX_PRD		= ATA_MAX_PRD / 2,
@@ -366,40 +406,38 @@ enum {
 	 * Quirk flags: may be set by libata or controller drivers on drives.
 	 * Some quirks may be drive/controller pair dependent.
 	 */
-	ATA_QUIRK_DIAGNOSTIC	= (1 << 0),	/* Failed boot diag */
-	ATA_QUIRK_NODMA		= (1 << 1),	/* DMA problems */
-	ATA_QUIRK_NONCQ		= (1 << 2),	/* Do not use NCQ */
-	ATA_QUIRK_MAX_SEC_128	= (1 << 3),	/* Limit max sects to 128 */
-	ATA_QUIRK_BROKEN_HPA	= (1 << 4),	/* Broken HPA */
-	ATA_QUIRK_DISABLE	= (1 << 5),	/* Disable it */
-	ATA_QUIRK_HPA_SIZE	= (1 << 6),	/* Native size off by one */
-	ATA_QUIRK_IVB		= (1 << 8),	/* CBL det validity bit bugs */
-	ATA_QUIRK_STUCK_ERR	= (1 << 9),	/* Stuck ERR on next PACKET */
-	ATA_QUIRK_BRIDGE_OK	= (1 << 10),	/* No bridge limits */
-	ATA_QUIRK_ATAPI_MOD16_DMA = (1 << 11),	/* Use ATAPI DMA for commands */
-						/* not multiple of 16 bytes */
-	ATA_QUIRK_FIRMWARE_WARN = (1 << 12),	/* Firmware update warning */
-	ATA_QUIRK_1_5_GBPS	= (1 << 13),	/* Force 1.5 Gbps */
-	ATA_QUIRK_NOSETXFER	= (1 << 14),	/* Skip SETXFER, SATA only */
-	ATA_QUIRK_BROKEN_FPDMA_AA = (1 << 15),	/* Skip AA */
-	ATA_QUIRK_DUMP_ID	= (1 << 16),	/* Dump IDENTIFY data */
-	ATA_QUIRK_MAX_SEC_LBA48 = (1 << 17),	/* Set max sects to 65535 */
-	ATA_QUIRK_ATAPI_DMADIR	= (1 << 18),	/* Device requires dmadir */
-	ATA_QUIRK_NO_NCQ_TRIM	= (1 << 19),	/* Do not use queued TRIM */
-	ATA_QUIRK_NOLPM		= (1 << 20),	/* Do not use LPM */
-	ATA_QUIRK_WD_BROKEN_LPM = (1 << 21),	/* Some WDs have broken LPM */
-	ATA_QUIRK_ZERO_AFTER_TRIM = (1 << 22),	/* Guarantees zero after trim */
-	ATA_QUIRK_NO_DMA_LOG	= (1 << 23),	/* Do not use DMA for log read */
-	ATA_QUIRK_NOTRIM	= (1 << 24),	/* Do not use TRIM */
-	ATA_QUIRK_MAX_SEC_1024	= (1 << 25),	/* Limit max sects to 1024 */
-	ATA_QUIRK_MAX_TRIM_128M = (1 << 26),	/* Limit max trim size to 128M */
-	ATA_QUIRK_NO_NCQ_ON_ATI = (1 << 27),	/* Disable NCQ on ATI chipset */
-	ATA_QUIRK_NO_ID_DEV_LOG = (1 << 28),	/* Identify device log missing */
-	ATA_QUIRK_NO_LOG_DIR	= (1 << 29),	/* Do not read log directory */
-	ATA_QUIRK_NO_FUA	= (1 << 30),	/* Do not use FUA */
-
-	 /* DMA mask for user DMA control: User visible values; DO NOT
-	    renumber */
+	ATA_QUIRK_DIAGNOSTIC		= (1U << __ATA_QUIRK_DIAGNOSTIC),
+	ATA_QUIRK_NODMA			= (1U << __ATA_QUIRK_NODMA),
+	ATA_QUIRK_NONCQ			= (1U << __ATA_QUIRK_NONCQ),
+	ATA_QUIRK_MAX_SEC_128		= (1U << __ATA_QUIRK_MAX_SEC_128),
+	ATA_QUIRK_BROKEN_HPA		= (1U << __ATA_QUIRK_BROKEN_HPA),
+	ATA_QUIRK_DISABLE		= (1U << __ATA_QUIRK_DISABLE),
+	ATA_QUIRK_HPA_SIZE		= (1U << __ATA_QUIRK_HPA_SIZE),
+	ATA_QUIRK_IVB			= (1U << __ATA_QUIRK_IVB),
+	ATA_QUIRK_STUCK_ERR		= (1U << __ATA_QUIRK_STUCK_ERR),
+	ATA_QUIRK_BRIDGE_OK		= (1U << __ATA_QUIRK_BRIDGE_OK),
+	ATA_QUIRK_ATAPI_MOD16_DMA	= (1U << __ATA_QUIRK_ATAPI_MOD16_DMA),
+	ATA_QUIRK_FIRMWARE_WARN		= (1U << __ATA_QUIRK_FIRMWARE_WARN),
+	ATA_QUIRK_1_5_GBPS		= (1U << __ATA_QUIRK_1_5_GBPS),
+	ATA_QUIRK_NOSETXFER		= (1U << __ATA_QUIRK_NOSETXFER),
+	ATA_QUIRK_BROKEN_FPDMA_AA	= (1U << __ATA_QUIRK_BROKEN_FPDMA_AA),
+	ATA_QUIRK_DUMP_ID		= (1U << __ATA_QUIRK_DUMP_ID),
+	ATA_QUIRK_MAX_SEC_LBA48		= (1U << __ATA_QUIRK_MAX_SEC_LBA48),
+	ATA_QUIRK_ATAPI_DMADIR		= (1U << __ATA_QUIRK_ATAPI_DMADIR),
+	ATA_QUIRK_NO_NCQ_TRIM		= (1U << __ATA_QUIRK_NO_NCQ_TRIM),
+	ATA_QUIRK_NOLPM			= (1U << __ATA_QUIRK_NOLPM),
+	ATA_QUIRK_WD_BROKEN_LPM		= (1U << __ATA_QUIRK_WD_BROKEN_LPM),
+	ATA_QUIRK_ZERO_AFTER_TRIM	= (1U << __ATA_QUIRK_ZERO_AFTER_TRIM),
+	ATA_QUIRK_NO_DMA_LOG		= (1U << __ATA_QUIRK_NO_DMA_LOG),
+	ATA_QUIRK_NOTRIM		= (1U << __ATA_QUIRK_NOTRIM),
+	ATA_QUIRK_MAX_SEC_1024		= (1U << __ATA_QUIRK_MAX_SEC_1024),
+	ATA_QUIRK_MAX_TRIM_128M		= (1U << __ATA_QUIRK_MAX_TRIM_128M),
+	ATA_QUIRK_NO_NCQ_ON_ATI		= (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
+	ATA_QUIRK_NO_ID_DEV_LOG		= (1U << __ATA_QUIRK_NO_ID_DEV_LOG),
+	ATA_QUIRK_NO_LOG_DIR		= (1U << __ATA_QUIRK_NO_LOG_DIR),
+	ATA_QUIRK_NO_FUA		= (1U << __ATA_QUIRK_NO_FUA),
+
+	/* User visible DMA mask for DMA control. DO NOT renumber. */
 	ATA_DMA_MASK_ATA	= (1 << 0),	/* DMA on ATA Disk */
 	ATA_DMA_MASK_ATAPI	= (1 << 1),	/* DMA on ATAPI */
 	ATA_DMA_MASK_CFA	= (1 << 2),	/* DMA on CF Card */
-- 
cgit v1.2.3


From 1cbe8143fd2f588031a5f157d15ae15ce12215e2 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 30 Jul 2024 13:37:33 +0800
Subject: bpf: kprobe: Remove unused declaring of bpf_kprobe_override

After the commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction
pointer with original one"), "bpf_kprobe_override" is not used anywhere
anymore, and we can remove it now.

Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one")
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240730053733.885785-1-dongml2@chinatelecom.cn
---
 include/linux/trace_events.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 9df3e2973626..9435185c10ef 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -880,7 +880,6 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
-- 
cgit v1.2.3


From c149c4a48b19afbf0c383614e57b452d39b154de Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Sat, 13 Jul 2024 08:59:16 +0000
Subject: cgroup/cpuset: Remove cpuset_slab_spread_rotor

Since the SLAB implementation was removed in v6.8, so the
cpuset_slab_spread_rotor is no longer used and can be removed.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h | 6 ------
 include/linux/sched.h  | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index de4cf0ee96f7..2a6981eeebf8 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -113,7 +113,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 			    struct pid *pid, struct task_struct *tsk);
 
 extern int cpuset_mem_spread_node(void);
-extern int cpuset_slab_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
 {
@@ -246,11 +245,6 @@ static inline int cpuset_mem_spread_node(void)
 	return 0;
 }
 
-static inline int cpuset_slab_spread_node(void)
-{
-	return 0;
-}
-
 static inline int cpuset_do_page_mem_spread(void)
 {
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..3773c1c8f099 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1243,7 +1243,6 @@ struct task_struct {
 	/* Sequence number to catch updates: */
 	seqcount_spinlock_t		mems_allowed_seq;
 	int				cpuset_mem_spread_rotor;
-	int				cpuset_slab_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
 	/* Control Group info protected by css_set_lock: */
-- 
cgit v1.2.3


From 93c8332c8373fee415bd79f08d5ba4ba7ca5ad15 Mon Sep 17 00:00:00 2001
From: Xavier <xavier_qy@163.com>
Date: Thu, 4 Jul 2024 14:24:43 +0800
Subject: Union-Find: add a new module in kernel library

This patch implements a union-find data structure in the kernel library,
which includes operations for allocating nodes, freeing nodes,
finding the root of a node, and merging two nodes.

Signed-off-by: Xavier <xavier_qy@163.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/union_find.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 include/linux/union_find.h

(limited to 'include/linux')

diff --git a/include/linux/union_find.h b/include/linux/union_find.h
new file mode 100644
index 000000000000..cfd49263c138
--- /dev/null
+++ b/include/linux/union_find.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_UNION_FIND_H
+#define __LINUX_UNION_FIND_H
+/**
+ * union_find.h - union-find data structure implementation
+ *
+ * This header provides functions and structures to implement the union-find
+ * data structure. The union-find data structure is used to manage disjoint
+ * sets and supports efficient union and find operations.
+ *
+ * See Documentation/core-api/union_find.rst for documentation and samples.
+ */
+
+struct uf_node {
+	struct uf_node *parent;
+	unsigned int rank;
+};
+
+/* This macro is used for static initialization of a union-find node. */
+#define UF_INIT_NODE(node)	{.parent = &node, .rank = 0}
+
+/**
+ * uf_node_init - Initialize a union-find node
+ * @node: pointer to the union-find node to be initialized
+ *
+ * This function sets the parent of the node to itself and
+ * initializes its rank to 0.
+ */
+static inline void uf_node_init(struct uf_node *node)
+{
+	node->parent = node;
+	node->rank = 0;
+}
+
+/* find the root of a node */
+struct uf_node *uf_find(struct uf_node *node);
+
+/* Merge two intersecting nodes */
+void uf_union(struct uf_node *node1, struct uf_node *node2);
+
+#endif /* __LINUX_UNION_FIND_H */
-- 
cgit v1.2.3


From 6dfbafd8a1d51a52a623d912a2219e9982020854 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Fri, 12 Jul 2024 16:08:00 +0200
Subject: soundwire: bus: drop unused driver name field

The soundwire driver name field is not currently used by any driver (and
even appears to never have been used) so drop it.

Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20240712140801.24267-3-johan+linaro@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 94fc1b57c57b..5e0dd47a0412 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -704,8 +704,6 @@ struct sdw_master_device {
 	container_of(d, struct sdw_master_device, dev)
 
 struct sdw_driver {
-	const char *name;
-
 	int (*probe)(struct sdw_slave *sdw,
 			const struct sdw_device_id *id);
 	int (*remove)(struct sdw_slave *sdw);
-- 
cgit v1.2.3


From 990c304930138dcd7a49763417e6e5313b81293e Mon Sep 17 00:00:00 2001
From: Patrick Rohr <prohr@google.com>
Date: Mon, 29 Jul 2024 15:00:59 -0700
Subject: Add support for PIO p flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

draft-ietf-6man-pio-pflag is adding a new flag to the Prefix Information
Option to signal that the network can allocate a unique IPv6 prefix per
client via DHCPv6-PD (see draft-ietf-v6ops-dhcp-pd-per-device).

When ra_honor_pio_pflag is enabled, the presence of a P-flag causes
SLAAC autoconfiguration to be disabled for that particular PIO.

An automated test has been added in Android (r.android.com/3195335) to
go along with this change.

Cc: Maciej Żenczykowski <maze@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Cc: David Lamparter <equinox@opensourcerouting.org>
Cc: Simon Horman <horms@kernel.org>
Signed-off-by: Patrick Rohr <prohr@google.com>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 383a0ea2ab91..a6e2aadbb91b 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -89,6 +89,7 @@ struct ipv6_devconf {
 	__u8		ioam6_enabled;
 	__u8		ndisc_evict_nocarrier;
 	__u8		ra_honor_pio_life;
+	__u8		ra_honor_pio_pflag;
 
 	struct ctl_table_header *sysctl_header;
 };
-- 
cgit v1.2.3


From 9aed3b51fd6186582a95abd9fa67782982540749 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jul 2024 18:49:35 -0700
Subject: rcu: Better define "atomic" for list replacement

The kernel-doc headers for list_replace_rcu() and hlist_replace_rcu()
claim that the replacement is atomic, which it is, but only for readers.
Avoid confusion by making it clear that the atomic nature of these
functions applies only to readers, not to concurrent updaters.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rculist.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3dc1e58865f7..14dfa6008467 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -191,7 +191,10 @@ static inline void hlist_del_init_rcu(struct hlist_node *n)
  * @old : the element to be replaced
  * @new : the new element to insert
  *
- * The @old entry will be replaced with the @new entry atomically.
+ * The @old entry will be replaced with the @new entry atomically from
+ * the perspective of concurrent readers.  It is the caller's responsibility
+ * to synchronize with concurrent updaters, if any.
+ *
  * Note: @old should not be empty.
  */
 static inline void list_replace_rcu(struct list_head *old,
@@ -519,7 +522,9 @@ static inline void hlist_del_rcu(struct hlist_node *n)
  * @old : the element to be replaced
  * @new : the new element to insert
  *
- * The @old entry will be replaced with the @new entry atomically.
+ * The @old entry will be replaced with the @new entry atomically from
+ * the perspective of concurrent readers.  It is the caller's responsibility
+ * to synchronize with concurrent updaters, if any.
  */
 static inline void hlist_replace_rcu(struct hlist_node *old,
 					struct hlist_node *new)
-- 
cgit v1.2.3


From ab03125268679e058e1e7b6612f6d12610761769 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 15 Jul 2024 11:00:34 -0400
Subject: cgroup: Show # of subsystem CSSes in cgroup.stat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cgroup subsystem state (CSS) is an abstraction in the cgroup layer to
help manage different structures in various cgroup subsystems by being
an embedded element inside a larger structure like cpuset or mem_cgroup.

The /proc/cgroups file shows the number of cgroups for each of the
subsystems.  With cgroup v1, the number of CSSes is the same as the
number of cgroups.  That is not the case anymore with cgroup v2. The
/proc/cgroups file cannot show the actual number of CSSes for the
subsystems that are bound to cgroup v2.

So if a v2 cgroup subsystem is leaking cgroups (usually memory cgroup),
we can't tell by looking at /proc/cgroups which cgroup subsystems may
be responsible.

As cgroup v2 had deprecated the use of /proc/cgroups, the hierarchical
cgroup.stat file is now being extended to show the number of live and
dying CSSes associated with all the non-inhibited cgroup subsystems that
have been bound to cgroup v2. The number includes CSSes in the current
cgroup as well as in all the descendants underneath it.  This will help
us pinpoint which subsystems are responsible for the increasing number
of dying (nr_dying_descendants) cgroups.

The CSSes dying counts are stored in the cgroup structure itself
instead of inside the CSS as suggested by Johannes. This will allow
us to accurately track dying counts of cgroup subsystems that have
recently been disabled in a cgroup. It is now possible that a zero
subsystem number is coupled with a non-zero dying subsystem number.

The cgroup-v2.rst file is updated to discuss this new behavior.

With this patch applied, a sample output from root cgroup.stat file
was shown below.

	nr_descendants 56
	nr_subsys_cpuset 1
	nr_subsys_cpu 43
	nr_subsys_io 43
	nr_subsys_memory 56
	nr_subsys_perf_event 57
	nr_subsys_hugetlb 1
	nr_subsys_pids 56
	nr_subsys_rdma 1
	nr_subsys_misc 1
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Another sample output from system.slice/cgroup.stat was:

	nr_descendants 34
	nr_subsys_cpuset 0
	nr_subsys_cpu 32
	nr_subsys_io 32
	nr_subsys_memory 34
	nr_subsys_perf_event 35
	nr_subsys_hugetlb 0
	nr_subsys_pids 34
	nr_subsys_rdma 0
	nr_subsys_misc 0
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Note that 'debug' controller wasn't used to provide this information because
the controller is not recommended in productions kernels, also many of them
won't enable CONFIG_CGROUP_DEBUG by default.

Similar information could be retrieved with debuggers like drgn but that's
also not always available (e.g. lockdown) and the additional cost of runtime
tracking here is deemed marginal.

tj: Added Michal's paragraphs on why this is not added the debug controller
    to the commit message.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Michal Koutný <mkoutny@suse.com>
Link: http://lkml.kernel.org/r/20240715150034.2583772-1-longman@redhat.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ae04035b6cbe..eb0f6f349496 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -210,6 +210,14 @@ struct cgroup_subsys_state {
 	 * fields of the containing structure.
 	 */
 	struct cgroup_subsys_state *parent;
+
+	/*
+	 * Keep track of total numbers of visible descendant CSSes.
+	 * The total number of dying CSSes is tracked in
+	 * css->cgroup->nr_dying_subsys[ssid].
+	 * Protected by cgroup_mutex.
+	 */
+	int nr_descendants;
 };
 
 /*
@@ -470,6 +478,12 @@ struct cgroup {
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
+	/*
+	 * Keep track of total number of dying CSSes at and below this cgroup.
+	 * Protected by cgroup_mutex.
+	 */
+	int nr_dying_subsys[CGROUP_SUBSYS_COUNT];
+
 	struct cgroup_root *root;
 
 	/*
-- 
cgit v1.2.3


From f81489a136ac2c18a7b9bb22cf37c0a6f6d58545 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 12 Jul 2024 13:57:13 -0700
Subject: hwmon: (max6697) Drop platform data support

Platform data is not used anywhere in the upstram kernel.
Drop support for it to simplify code maintenance.

Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/platform_data/max6697.h | 33 ---------------------------------
 1 file changed, 33 deletions(-)
 delete mode 100644 include/linux/platform_data/max6697.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/max6697.h b/include/linux/platform_data/max6697.h
deleted file mode 100644
index 6fbb70005541..000000000000
--- a/include/linux/platform_data/max6697.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * max6697.h
- *     Copyright (c) 2012 Guenter Roeck <linux@roeck-us.net>
- */
-
-#ifndef MAX6697_H
-#define MAX6697_H
-
-#include <linux/types.h>
-
-/*
- * For all bit masks:
- * bit 0:    local temperature
- * bit 1..7: remote temperatures
- */
-struct max6697_platform_data {
-	bool smbus_timeout_disable;	/* set to disable SMBus timeouts */
-	bool extended_range_enable;	/* set to enable extended temp range */
-	bool beta_compensation;		/* set to enable beta compensation */
-	u8 alert_mask;			/* set bit to 1 to disable alert */
-	u8 over_temperature_mask;	/* set bit to 1 to disable */
-	u8 resistance_cancellation;	/* set bit to 0 to disable
-					 * bit mask for MAX6581,
-					 * boolean for other chips
-					 */
-	u8 ideality_mask;		/* set bit to 0 to disable */
-	u8 ideality_value;		/* transistor ideality as per
-					 * MAX6581 datasheet
-					 */
-};
-
-#endif /* MAX6697_H */
-- 
cgit v1.2.3


From 298dec19bdeb6e33ac220502504d969272b50cf6 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 31 Jul 2024 00:14:36 -0500
Subject: scx: Allow calling sleepable kfuncs from BPF_PROG_TYPE_SYSCALL

We currently only allow calling sleepable scx kfuncs (i.e.
scx_bpf_create_dsq()) from BPF_PROG_TYPE_STRUCT_OPS progs. The idea here
was that we'd never have to call scx_bpf_create_dsq() outside of a
sched_ext struct_ops callback, but that might not actually be true. For
example, a scheduler could do something like the following:

1. Open and load (not yet attach) a scheduler skel

2. Synchronously call into a BPF_PROG_TYPE_SYSCALL prog from user space.
   For example, to initialize an LLC domain, or some other global,
   read-only state.

3. Attach the skel, which actually enables the scheduler

The advantage of doing this is that it can preclude having to do pretty
ugly boilerplate like initializing a read-only, statically sized array of
u64[]'s which the kernel consumes literally once at init time to then
create struct bpf_cpumask objects which are actually queried at runtime.

Doing the above is already possible given that we can invoke core BPF
kfuncs, such as bpf_cpumask_create(), from BPF_PROG_TYPE_SYSCALL progs. We
already allow many scx kfuncs to be called from BPF_PROG_TYPE_SYSCALL progs
(e.g. scx_bpf_kick_cpu()). Let's allow the sleepable kfuncs as well.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 593d2f4909dd..26e1c33bc844 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -106,16 +106,14 @@ enum scx_ent_dsq_flags {
  * mechanism. See scx_kf_allow().
  */
 enum scx_kf_mask {
-	SCX_KF_UNLOCKED		= 0,	  /* not sleepable, not rq locked */
-	/* all non-sleepables may be nested inside SLEEPABLE */
-	SCX_KF_SLEEPABLE	= 1 << 0, /* sleepable init operations */
+	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
 	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-	SCX_KF_CPU_RELEASE	= 1 << 1, /* ops.cpu_release() */
+	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
 	/* ops.dequeue (in REST) may be nested inside DISPATCH */
-	SCX_KF_DISPATCH		= 1 << 2, /* ops.dispatch() */
-	SCX_KF_ENQUEUE		= 1 << 3, /* ops.enqueue() and ops.select_cpu() */
-	SCX_KF_SELECT_CPU	= 1 << 4, /* ops.select_cpu() */
-	SCX_KF_REST		= 1 << 5, /* other rq-locked operations */
+	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
+	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
+	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
+	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */
 
 	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
 				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
-- 
cgit v1.2.3


From be72a57527fde6c80061c5f9d0e28762eb817b03 Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Wed, 24 Jul 2024 10:06:58 +0800
Subject: lsm: Refactor return value of LSM hook vm_enough_memory

To be consistent with most LSM hooks, convert the return value of
hook vm_enough_memory to 0 or a negative error code.

Before:
- Hook vm_enough_memory returns 1 if permission is granted, 0 if not.
- LSM_RET_DEFAULT(vm_enough_memory_mm) is 1.

After:
- Hook vm_enough_memory reutrns 0 if permission is granted, negative
  error code if not.
- LSM_RET_DEFAULT(vm_enough_memory_mm) is 0.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 12c81be78eb9..63e2656d1d56 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -48,7 +48,7 @@ LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
 LSM_HOOK(int, 0, syslog, int type)
 LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
 	 const struct timezone *tz)
-LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages)
+LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages)
 LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
 LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1390f1efb4f0..62233fec8ead 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -634,7 +634,7 @@ static inline int security_settime64(const struct timespec64 *ts,
 
 static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
-	return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages));
+	return __vm_enough_memory(mm, pages, !cap_vm_enough_memory(mm, pages));
 }
 
 static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm)
-- 
cgit v1.2.3


From c9c0ee5f20c593faf289fa8850c3ed84031dd12a Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 29 Jul 2024 03:47:40 -0700
Subject: net: skbuff: Skip early return in skb_unref when debugging

This patch modifies the skb_unref function to skip the early return
optimization when CONFIG_DEBUG_NET is enabled. The change ensures that
the reference count decrement always occurs in debug builds, allowing
for more thorough checking of SKB reference counting.

Previously, when the SKB's reference count was 1 and CONFIG_DEBUG_NET
was not set, the function would return early after a memory barrier
(smp_rmb()) without decrementing the reference count. This optimization
assumes it's safe to proceed with freeing the SKB without the overhead
of an atomic decrement from 1 to 0.

With this change:
- In non-debug builds (CONFIG_DEBUG_NET not set), behavior remains
  unchanged, preserving the performance optimization.
- In debug builds (CONFIG_DEBUG_NET set), the reference count is always
  decremented, even when it's 1, allowing for consistent behavior and
  potentially catching subtle SKB management bugs.

This modification enhances debugging capabilities for networking code
without impacting performance in production kernels. It helps kernel
developers identify and diagnose issues related to SKB management and
reference counting in the network stack.

Cc: Chris Mason <clm@fb.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20240729104741.370327-1-leitao@debian.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 29c3ea5b6e93..cf8f6ce06742 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1225,7 +1225,7 @@ static inline bool skb_unref(struct sk_buff *skb)
 {
 	if (unlikely(!skb))
 		return false;
-	if (likely(refcount_read(&skb->users) == 1))
+	if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1))
 		smp_rmb();
 	else if (likely(!refcount_dec_and_test(&skb->users)))
 		return false;
-- 
cgit v1.2.3


From f0fcdd2cb0db62605d85f3b97a1b443e7c91f886 Mon Sep 17 00:00:00 2001
From: John Allen <john.allen@amd.com>
Date: Tue, 30 Jul 2024 15:17:30 +0000
Subject: ACPI: PRM: Add PRM handler direct call support

Platform Runtime Mechanism (PRM) handlers can be invoked from either the AML
interpreter or directly by an OS driver. Implement the latter.

  [ bp: Massage commit message. ]

Signed-off-by: John Allen <john.allen@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20240730151731.15363-2-john.allen@amd.com
---
 include/linux/prmt.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/prmt.h b/include/linux/prmt.h
index 24da8364b919..9c094294403f 100644
--- a/include/linux/prmt.h
+++ b/include/linux/prmt.h
@@ -2,6 +2,11 @@
 
 #ifdef CONFIG_ACPI_PRMT
 void init_prmt(void);
+int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer);
 #else
 static inline void init_prmt(void) { }
+static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer)
+{
+	return -EOPNOTSUPP;
+}
 #endif
-- 
cgit v1.2.3


From 6b08d07cac64c0dcf1dbb4584bdf95d0f789a520 Mon Sep 17 00:00:00 2001
From: Lukasz Majewski <lukma@denx.de>
Date: Wed, 10 Jul 2024 12:06:51 +0200
Subject: leds: trigger: netdev: Add support for tx_err and rx_err notification
 with LEDs

This patch provides support for enabling blinking of LEDs when RX or TX
errors are detected.

Approach taken in this patch is similar to one for TX or RX data
transmission indication (i.e. TRIGGER_NETDEV_TX/RX attribute).

One can inspect transmission errors with:
ip -s link show eth0

Example LED configuration:
cd /sys/devices/platform/amba_pl@0/a001a000.leds/leds/
echo netdev > mode:blue/trigger && \
echo eth0 > mode:blue/device_name && \
echo 1 > mode:blue/tx_err

Signed-off-by: Lukasz Majewski <lukma@denx.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240710100651.4059887-1-lukma@denx.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/leds.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6885603f211b..e5968c3ed4ae 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -611,6 +611,8 @@ enum led_trigger_netdev_modes {
 	TRIGGER_NETDEV_FULL_DUPLEX,
 	TRIGGER_NETDEV_TX,
 	TRIGGER_NETDEV_RX,
+	TRIGGER_NETDEV_TX_ERR,
+	TRIGGER_NETDEV_RX_ERR,
 
 	/* Keep last */
 	__TRIGGER_NETDEV_MAX,
-- 
cgit v1.2.3


From 480a8ad683d7a54e8d38c9c7778fb6b15aac241a Mon Sep 17 00:00:00 2001
From: Haibo Chen <haibo.chen@nxp.com>
Date: Mon, 22 Jul 2024 15:10:58 +0300
Subject: mfd: adp5585: Add Analog Devices ADP5585 core support

The ADP5585 is a 10/11 input/output port expander with a built in keypad
matrix decoder, programmable logic, reset generator, and PWM generator.
This driver supports the chip by modelling it as an MFD device, with two
child devices for the GPIO and PWM functions.

The driver is derived from an initial implementation from NXP, available
in commit 8059835bee19 ("MLK-25917-1 mfd: adp5585: add ADI adp5585 core
support") in their BSP kernel tree. It has been extensively rewritten.

Signed-off-by: Haibo Chen <haibo.chen@nxp.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20240722121100.2855-3-laurent.pinchart@ideasonboard.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/adp5585.h | 126 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 include/linux/mfd/adp5585.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
new file mode 100644
index 000000000000..016033cd68e4
--- /dev/null
+++ b/include/linux/mfd/adp5585.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Analog Devices ADP5585 I/O expander, PWM controller and keypad controller
+ *
+ * Copyright 2022 NXP
+ * Copyright 2024 Ideas on Board Oy
+ */
+
+#ifndef __MFD_ADP5585_H_
+#define __MFD_ADP5585_H_
+
+#include <linux/bits.h>
+
+#define ADP5585_ID			0x00
+#define		ADP5585_MAN_ID_VALUE		0x20
+#define		ADP5585_MAN_ID_MASK		GENMASK(7, 4)
+#define ADP5585_INT_STATUS		0x01
+#define ADP5585_STATUS			0x02
+#define ADP5585_FIFO_1			0x03
+#define ADP5585_FIFO_2			0x04
+#define ADP5585_FIFO_3			0x05
+#define ADP5585_FIFO_4			0x06
+#define ADP5585_FIFO_5			0x07
+#define ADP5585_FIFO_6			0x08
+#define ADP5585_FIFO_7			0x09
+#define ADP5585_FIFO_8			0x0a
+#define ADP5585_FIFO_9			0x0b
+#define ADP5585_FIFO_10			0x0c
+#define ADP5585_FIFO_11			0x0d
+#define ADP5585_FIFO_12			0x0e
+#define ADP5585_FIFO_13			0x0f
+#define ADP5585_FIFO_14			0x10
+#define ADP5585_FIFO_15			0x11
+#define ADP5585_FIFO_16			0x12
+#define ADP5585_GPI_INT_STAT_A		0x13
+#define ADP5585_GPI_INT_STAT_B		0x14
+#define ADP5585_GPI_STATUS_A		0x15
+#define ADP5585_GPI_STATUS_B		0x16
+#define ADP5585_RPULL_CONFIG_A		0x17
+#define ADP5585_RPULL_CONFIG_B		0x18
+#define ADP5585_RPULL_CONFIG_C		0x19
+#define ADP5585_RPULL_CONFIG_D		0x1a
+#define		ADP5585_Rx_PULL_CFG_PU_300K	0
+#define		ADP5585_Rx_PULL_CFG_PD_300K	1
+#define		ADP5585_Rx_PULL_CFG_PU_100K	2
+#define		ADP5585_Rx_PULL_CFG_DISABLE	3
+#define		ADP5585_Rx_PULL_CFG_MASK	3
+#define ADP5585_GPI_INT_LEVEL_A		0x1b
+#define ADP5585_GPI_INT_LEVEL_B		0x1c
+#define ADP5585_GPI_EVENT_EN_A		0x1d
+#define ADP5585_GPI_EVENT_EN_B		0x1e
+#define ADP5585_GPI_INTERRUPT_EN_A	0x1f
+#define ADP5585_GPI_INTERRUPT_EN_B	0x20
+#define ADP5585_DEBOUNCE_DIS_A		0x21
+#define ADP5585_DEBOUNCE_DIS_B		0x22
+#define ADP5585_GPO_DATA_OUT_A		0x23
+#define ADP5585_GPO_DATA_OUT_B		0x24
+#define ADP5585_GPO_OUT_MODE_A		0x25
+#define ADP5585_GPO_OUT_MODE_B		0x26
+#define ADP5585_GPIO_DIRECTION_A	0x27
+#define ADP5585_GPIO_DIRECTION_B	0x28
+#define ADP5585_RESET1_EVENT_A		0x29
+#define ADP5585_RESET1_EVENT_B		0x2a
+#define ADP5585_RESET1_EVENT_C		0x2b
+#define ADP5585_RESET2_EVENT_A		0x2c
+#define ADP5585_RESET2_EVENT_B		0x2d
+#define ADP5585_RESET_CFG		0x2e
+#define ADP5585_PWM_OFFT_LOW		0x2f
+#define ADP5585_PWM_OFFT_HIGH		0x30
+#define ADP5585_PWM_ONT_LOW		0x31
+#define ADP5585_PWM_ONT_HIGH		0x32
+#define ADP5585_PWM_CFG			0x33
+#define		ADP5585_PWM_IN_AND		BIT(2)
+#define		ADP5585_PWM_MODE		BIT(1)
+#define		ADP5585_PWM_EN			BIT(0)
+#define ADP5585_LOGIC_CFG		0x34
+#define ADP5585_LOGIC_FF_CFG		0x35
+#define ADP5585_LOGIC_INT_EVENT_EN	0x36
+#define ADP5585_POLL_PTIME_CFG		0x37
+#define ADP5585_PIN_CONFIG_A		0x38
+#define ADP5585_PIN_CONFIG_B		0x39
+#define ADP5585_PIN_CONFIG_C		0x3a
+#define		ADP5585_PULL_SELECT		BIT(7)
+#define		ADP5585_C4_EXTEND_CFG_GPIO11	(0U << 6)
+#define		ADP5585_C4_EXTEND_CFG_RESET2	(1U << 6)
+#define		ADP5585_C4_EXTEND_CFG_MASK	GENMASK(6, 6)
+#define		ADP5585_R4_EXTEND_CFG_GPIO5	(0U << 5)
+#define		ADP5585_R4_EXTEND_CFG_RESET1	(1U << 5)
+#define		ADP5585_R4_EXTEND_CFG_MASK	GENMASK(5, 5)
+#define		ADP5585_R3_EXTEND_CFG_GPIO4	(0U << 2)
+#define		ADP5585_R3_EXTEND_CFG_LC	(1U << 2)
+#define		ADP5585_R3_EXTEND_CFG_PWM_OUT	(2U << 2)
+#define		ADP5585_R3_EXTEND_CFG_MASK	GENMASK(3, 2)
+#define		ADP5585_R0_EXTEND_CFG_GPIO1	(0U << 0)
+#define		ADP5585_R0_EXTEND_CFG_LY	(1U << 0)
+#define		ADP5585_R0_EXTEND_CFG_MASK	GENMASK(0, 0)
+#define ADP5585_GENERAL_CFG		0x3b
+#define		ADP5585_OSC_EN			BIT(7)
+#define		ADP5585_OSC_FREQ_50KHZ		(0U << 5)
+#define		ADP5585_OSC_FREQ_100KHZ		(1U << 5)
+#define		ADP5585_OSC_FREQ_200KHZ		(2U << 5)
+#define		ADP5585_OSC_FREQ_500KHZ		(3U << 5)
+#define		ADP5585_OSC_FREQ_MASK		GENMASK(6, 5)
+#define		ADP5585_INT_CFG			BIT(1)
+#define		ADP5585_RST_CFG			BIT(0)
+#define ADP5585_INT_EN			0x3c
+
+#define ADP5585_MAX_REG			ADP5585_INT_EN
+
+/*
+ * Bank 0 covers pins "GPIO 1/R0" to "GPIO 6/R5", numbered 0 to 5 by the
+ * driver, and bank 1 covers pins "GPIO 7/C0" to "GPIO 11/C4", numbered 6 to
+ * 10. Some variants of the ADP5585 don't support "GPIO 6/R5". As the driver
+ * uses identical GPIO numbering for all variants to avoid confusion, GPIO 5 is
+ * marked as reserved in the device tree for variants that don't support it.
+ */
+#define ADP5585_BANK(n)			((n) >= 6 ? 1 : 0)
+#define ADP5585_BIT(n)			((n) >= 6 ? BIT((n) - 6) : BIT(n))
+
+struct regmap;
+
+struct adp5585_dev {
+	struct regmap *regmap;
+};
+
+#endif
-- 
cgit v1.2.3


From bcbfcebda2cbc6a10a347d726e4a4f69e43a864e Mon Sep 17 00:00:00 2001
From: Mohamed Ghanmi <mohamed.ghanmi@supcom.tn>
Date: Sun, 9 Jun 2024 15:48:49 +0100
Subject: platform/x86: asus-wmi: add support for vivobook fan profiles

Add support for vivobook fan profiles wmi call on the ASUS VIVOBOOK
to adjust power limits.

These fan profiles have a different device id than the ROG series
and different order. This reorders the existing modes.

As part of keeping the patch clean the throttle_thermal_policy_available
boolean stored in the driver struct is removed and
throttle_thermal_policy_dev is used in place (as on init it is zeroed).

Co-developed-by: Luke D. Jones <luke@ljones.dev>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Signed-off-by: Mohamed Ghanmi <mohamed.ghanmi@supcom.tn>
Reviewed-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240609144849.2532-2-mohamed.ghanmi@supcom.tn
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 0aeeae1c1943..5e6f407b2def 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -68,6 +68,7 @@
 #define ASUS_WMI_DEVID_SCREENPAD_LIGHT	0x00050032
 #define ASUS_WMI_DEVID_FAN_BOOST_MODE	0x00110018
 #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075
+#define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019
 
 /* Misc */
 #define ASUS_WMI_DEVID_PANEL_OD		0x00050019
-- 
cgit v1.2.3


From b40824500eaa77668026b6d1ade6924901a680f9 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Tue, 30 Jul 2024 14:38:07 +0900
Subject: ata: libata: Remove ata_noop_qc_prep()

The function ata_noop_qc_prep(), as its name implies, does nothing and
simply returns AC_ERR_OK. For drivers that do not need any special
preparations of queued commands, we can avoid having to define struct
ata_port qc_prep operation by simply testing if that operation is
defined or not in ata_qc_issue(). Make this change and remove
ata_noop_qc_prep().

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d598ef690e50..d5446e18d9df 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1168,7 +1168,6 @@ extern int ata_xfer_mode2shift(u8 xfer_mode);
 extern const char *ata_mode_string(unsigned int xfer_mask);
 extern unsigned int ata_id_xfermask(const u16 *id);
 extern int ata_std_qc_defer(struct ata_queued_cmd *qc);
-extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
 		 unsigned int n_elem);
 extern unsigned int ata_dev_classify(const struct ata_taskfile *tf);
-- 
cgit v1.2.3


From bf1807c6ee1f66608c7835f6f9d9139c9c477942 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 1 Aug 2024 18:04:22 +0900
Subject: ata: libata: Print device quirks only once

In ata_dev_print_quirks(), return early if ata_dev_print_info() returns
false or if we already printed quirk information. This is to avoid
printing a device quirks multiple times (that is, each time
ata_dev_revalidate() is called).

To remember if ata_dev_print_quirks() was already executed, define the
EH context flag ATA_EHI_DID_PRINT_QUIRKS and set this flag in
ata_dev_print_quirks().

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: 58157d607aec ("ata: libata: Print quirks applied to devices")
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d5446e18d9df..0279c0a6302f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -378,6 +378,7 @@ enum {
 	ATA_EHI_PRINTINFO	= (1 << 18), /* print configuration info */
 	ATA_EHI_SETMODE		= (1 << 19), /* configure transfer mode */
 	ATA_EHI_POST_SETMODE	= (1 << 20), /* revalidating after setmode */
+	ATA_EHI_DID_PRINT_QUIRKS = (1 << 21), /* already printed quirks info */
 
 	ATA_EHI_DID_RESET	= ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET,
 
-- 
cgit v1.2.3


From e6ab45005772014ce49a693a63f928203c0cbbb0 Mon Sep 17 00:00:00 2001
From: Luigi Leonardi <luigi.leonardi@outlook.com>
Date: Tue, 30 Jul 2024 21:43:07 +0200
Subject: vsock/virtio: add SIOCOUTQ support for all virtio based transports

Introduce support for virtio_transport_unsent_bytes
ioctl for virtio_transport, vhost_vsock and vsock_loopback.

For all transports the unsent bytes counter is incremented
in virtio_transport_get_credit.

In virtio_transport (G2H) and in vhost-vsock (H2G) the counter
is decremented when the skbuff is consumed. In vsock_loopback the
same skbuff is passed from the transmitter to the receiver, so
the counter is decremented before queuing the skbuff to the
receiver.

Signed-off-by: Luigi Leonardi <luigi.leonardi@outlook.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_vsock.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..0387d64e2c66 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -133,6 +133,7 @@ struct virtio_vsock_sock {
 	u32 tx_cnt;
 	u32 peer_fwd_cnt;
 	u32 peer_buf_alloc;
+	size_t bytes_unsent;
 
 	/* Protected by rx_lock */
 	u32 fwd_cnt;
@@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
 s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
 u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);
 
+ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk);
+
+void virtio_transport_consume_skb_sent(struct sk_buff *skb,
+				       bool consume);
+
 int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 				 struct vsock_sock *psk);
 int
-- 
cgit v1.2.3


From 79bd233010859463e46a0a4b3926eaaba25a6110 Mon Sep 17 00:00:00 2001
From: Ben Gainey <ben.gainey@arm.com>
Date: Tue, 30 Jul 2024 09:44:14 +0100
Subject: perf: Rename perf_event_context.nr_pending to nr_no_switch_fast.

nr_pending counts the number of events in the context that
either pending_sigtrap or pending_work, but it is used
to prevent taking the fast path in perf_event_context_sched_out.

Renamed to reflect what it is used for, rather than what it
counts. This change allows using the field to track other
event properties that also require skipping the fast path
without possible confusion over the name.

Signed-off-by: Ben Gainey <ben.gainey@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240730084417.7693-2-ben.gainey@arm.com
---
 include/linux/perf_event.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6bb0c21d6335..655f66b18418 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -966,12 +966,13 @@ struct perf_event_context {
 	struct rcu_head			rcu_head;
 
 	/*
-	 * Sum (event->pending_work + event->pending_work)
+	 * The count of events for which using the switch-out fast path
+	 * should be avoided.
 	 *
 	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
 	 * that until the signal is delivered.
 	 */
-	local_t				nr_pending;
+	local_t				nr_no_switch_fast;
 };
 
 struct perf_cpu_pmu_context {
-- 
cgit v1.2.3


From 7e8b255650fcfa1d05a57e4093d8405e6d8dd488 Mon Sep 17 00:00:00 2001
From: Ben Gainey <ben.gainey@arm.com>
Date: Tue, 30 Jul 2024 09:44:15 +0100
Subject: perf: Support PERF_SAMPLE_READ with inherit

This change allows events to use PERF_SAMPLE_READ with inherit
so long as PERF_SAMPLE_TID is also set. This enables sample based
profiling of a group of counters over a hierarchy of processes or
threads. This is useful, for example, for collecting per-thread
counters/metrics, event based sampling of multiple counters as a unit,
access to the enabled and running time when using multiplexing and so
on.

Prior to this, users were restricted to either collecting aggregate
statistics for a multi-threaded/-process application (e.g. with
"perf stat"), or to sample individual threads, or to profile the entire
system (which requires root or CAP_PERFMON, and may produce much more
data than is required). Theoretically a tool could poll for or otherwise
monitor thread/process creation and construct whatever events the user
is interested in using perf_event_open, for each new thread or process,
but this is racy, can lead to file-descriptor exhaustion, and ultimately
just replicates the behaviour of inherit, but in userspace.

This configuration differs from inherit without PERF_SAMPLE_READ in that
the accumulated event count, and consequently any sample (such as if
triggered by overflow of sample_period) will be on a per-thread rather
than on an aggregate basis.

The meaning of read_format::value field of both PERF_RECORD_READ and
PERF_RECORD_SAMPLE is changed such that if the sampled event uses this
new configuration then the values reported will be per-thread rather
than the global aggregate value. This is a change from the existing
semantics of read_format (where PERF_SAMPLE_READ is used without
inherit), but it is necessary to expose the per-thread counter values,
and it avoids reinventing a separate "read_format_thread" field that
otherwise replicates the same behaviour. This change should not break
existing tools, since this configuration was not previously valid and
was rejected by the kernel. Tools that opt into this new mode will need
to account for this when calculating the counter delta for a given
sample. Tools that wish to have both the per-thread and aggregate value
can perform the global aggregation themselves from the per-thread
values.

The change to read_format::value does not affect existing valid
perf_event_attr configurations, nor does it change the behaviour of
calls to "read" on an event descriptor. Both continue to report the
aggregate value for the entire thread/process hierarchy. The difference
between the results reported by "read" and PERF_RECORD_SAMPLE in this
new configuration is justified on the basis that it is not (easily)
possible for "read" to target a specific thread (the caller only has
the fd for the original parent event).

Signed-off-by: Ben Gainey <ben.gainey@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240730084417.7693-3-ben.gainey@arm.com
---
 include/linux/perf_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 655f66b18418..701549967c18 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -969,6 +969,9 @@ struct perf_event_context {
 	 * The count of events for which using the switch-out fast path
 	 * should be avoided.
 	 *
+	 * Sum (event->pending_work + events with
+	 *    (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
+	 *
 	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
 	 * that until the signal is delivered.
 	 */
-- 
cgit v1.2.3


From cfa7f3d2c526c224a6271cc78a4a27a0de06f4f0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 29 Jul 2024 10:52:23 -0700
Subject: perf,x86: avoid missing caller address in stack traces captured in
 uprobe

When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be `push %rbp` instruction in function
preamble, which means that within that function frame pointer hasn't
been established yet. This leads to consistently missing an actual
caller of the traced function, because perf_callchain_user() only
records current IP (capturing traced function) and then following frame
pointer chain (which would be caller's frame, containing the address of
caller's caller).

So when we have target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, captured stack trace will report
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes a x86-64-specific heuristic to detect `push %rbp`
(`push %ebp` on 32-bit architecture) instruction being traced. Given
entire kernel implementation of user space stack trace capturing works
under assumption that user space code was compiled with frame pointer
register (%rbp/%ebp) preservation, it seems pretty reasonable to use
this instruction as a strong indicator that this is the entry to the
function. In that case, return address is still pointed to by %rsp/%esp,
so we fetch it and add to stack trace before proceeding to unwind the
rest using frame pointer-based logic.

We also check for `endbr64` (for 64-bit modes) as another common pattern
for function entry, as suggested by Josh Poimboeuf. Even if we get this
wrong sometimes for uprobes attached not at the function entry, it's OK
because stack trace will still be overall meaningful, just with one
extra bogus entry. If we don't detect this, we end up with guaranteed to
be missing caller function entry in the stack trace, which is worse
overall.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20240729175223.23914-1-andrii@kernel.org
---
 include/linux/uprobes.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..a270a5892ab4 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -76,6 +76,8 @@ struct uprobe_task {
 	struct uprobe			*active_uprobe;
 	unsigned long			xol_vaddr;
 
+	struct arch_uprobe              *auprobe;
+
 	struct return_instance		*return_instances;
 	unsigned int			depth;
 };
-- 
cgit v1.2.3


From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 1 Aug 2024 15:27:28 +0200
Subject: uprobes: kill uprobe_register_refctr()

It doesn't make any sense to have 2 versions of _register(). Note that
trace_uprobe_enable(), the only user of uprobe_register(), doesn't need
to check tu->ref_ctr_offset to decide which one should be used, it could
safely pass ref_ctr_offset == 0 to uprobe_register_refctr().

Add this argument to uprobe_register(), update the callers, and kill
uprobe_register_refctr().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com
---
 include/linux/uprobes.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index a270a5892ab4..788813c0b8fc 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
-extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
-extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
+extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
@@ -154,11 +153,7 @@ static inline void uprobes_init(void)
 #define uprobe_get_trap_addr(regs)	instruction_pointer(regs)
 
 static inline int
-uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
-	return -ENOSYS;
-}
-static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
 {
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 1 Aug 2024 15:27:34 +0200
Subject: uprobes: make uprobe_register() return struct uprobe *

This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *"
rather than inode + offset. This simplifies the code and allows to avoid
the unnecessary find_uprobe() + put_uprobe() in these functions.

TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that
this uprobe can't be freed before up_write(&uprobe->register_rwsem).

Co-developed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com
---
 include/linux/uprobes.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 788813c0b8fc..f50df1fa93e7 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 
+struct uprobe;
 struct vm_area_struct;
 struct mm_struct;
 struct inode;
@@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
-extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
-extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
+extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
+extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void uprobe_start_dup_mmap(void);
@@ -152,18 +153,18 @@ static inline void uprobes_init(void)
 
 #define uprobe_get_trap_addr(regs)	instruction_pointer(regs)
 
-static inline int
+static inline struct uprobe *
 uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
 {
-	return -ENOSYS;
+	return ERR_PTR(-ENOSYS);
 }
 static inline int
-uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
+uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
 {
 	return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 6edb8cd87cca5dd4389d4ea2d84b66ea94341bbd Mon Sep 17 00:00:00 2001
From: Kerem Karabay <kekrby@gmail.com>
Date: Fri, 5 Jul 2024 11:17:42 +0000
Subject: HID: core: add helper for finding a field with a certain usage

This helper will allow HID drivers to easily determine if they should
bind to a hid_device by checking for the prescence of a certain field
when its ID is not enough, which can be the case on USB devices with
multiple interfaces and/or configurations.

Convert google-hammer driver to use it, and remove now superfluous
hammer_has_usage().

[jkosina@suse.com: expand changelog with the information about
 google-hammer being added as user of this API ]
Signed-off-by: Kerem Karabay <kekrby@gmail.com>
Signed-off-by: Aditya Garg <gargaditya08@live.com>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 include/linux/hid.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 1533c9dcd3a6..2deff79f39a1 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -940,6 +940,8 @@ extern void hidinput_report_event(struct hid_device *hid, struct hid_report *rep
 extern int hidinput_connect(struct hid_device *hid, unsigned int force);
 extern void hidinput_disconnect(struct hid_device *);
 
+struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type,
+				 unsigned int application, unsigned int usage);
 int hid_set_field(struct hid_field *, unsigned, __s32);
 int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size,
 		     int interrupt);
-- 
cgit v1.2.3


From 79f194dd54cef3a0212914cd79dda1f91b7bf7f6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Jul 2024 18:11:29 +0200
Subject: thermal: trip: Get rid of thermal_zone_get_num_trips()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only existing caller of thermal_zone_get_num_trips(), which is
rcar_gen3_thermal_probe(), uses this function to put the number of
trip points into a kernel log message, but this information is also
available from the thermal sysfs interface.

For this reason, remove the thermal_zone_get_num_trips() call from
rcar_gen3_thermal_probe() and drop the former altogether.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/2636988.Lt9SDvczpP@rjwysocki.net
---
 include/linux/thermal.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 25fbf960b474..4fdfd5abd2c4 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -210,7 +210,6 @@ int for_each_thermal_trip(struct thermal_zone_device *tz,
 int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
 			       int (*cb)(struct thermal_trip *, void *),
 			       void *data);
-int thermal_zone_get_num_trips(struct thermal_zone_device *tz);
 void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
 				struct thermal_trip *trip, int temp);
 
-- 
cgit v1.2.3


From 8ecd953ca5850d2f83f548d4bbcf672d2d2cd0ca Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Jul 2024 18:12:45 +0200
Subject: thermal: trip: Drop thermal_zone_get_trip()

There are no more callers of thermal_zone_get_trip() in the tree, so
drop it.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://patch.msgid.link/2220301.Mh6RI2rZIc@rjwysocki.net
---
 include/linux/thermal.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 4fdfd5abd2c4..db44a57a7474 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -202,8 +202,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev,
 }
 #endif
 
-int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
-			  struct thermal_trip *trip);
 int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data);
-- 
cgit v1.2.3


From 37c6dccd683797a5a4ec85d2e94939bf829d3499 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 28 Jul 2024 20:26:59 +0100
Subject: cpufreq: Remove LATENCY_MULTIPLIER

The current LATENCY_MULTIPLIER which has been around for nearly 20 years
causes rate_limit_us to be always in ms range.

On M1 mac mini I get 50 and 56us transition latency, but due to the 1000
multiplier we end up setting rate_limit_us to 50 and 56ms, which gets
capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change
default transition delay to 2ms")

On Intel I5 system transition latency is 20us but due to the multiplier
we end up with 20ms that again is capped to 2ms.

Given how good modern hardware and how modern workloads require systems
to be more responsive to cater for sudden changes in workload (tasks
sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and
that 2ms is quarter of the time of 120Hz refresh rate system, drop the
old logic in favour of providing 50% headroom.

	rate_limit_us = 1.5 * latency.

I considered not adding any headroom which could mean that we can end up
with infinite back-to-back requests.

I also considered providing a constant headroom (e.g: 100us) assuming
that any h/w or f/w dealing with the request shouldn't require a large
headroom when transition_latency is actually high.

But for both cases I wasn't sure if h/w or f/w can end up being
overwhelmed dealing with the freq requests in a potentially busy system.
So I opted for providing 50% breathing room.

This is expected to impact schedutil only as the other user,
dbs_governor, takes the max(2*tick, transition_delay_us) and the former
was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us
before applying this patch. For systems with TICK of 4ms, this value
would have almost always ended up with 8ms sampling rate.

For systems that report 0 transition latency, we still default to
returning 1ms as transition delay.

This helps in eliminating a source of latency for applying requests as
mentioned in [1]. For example if we have a 1ms tick, most systems will
miss sending an update at tick when updating the util_avg for a task/CPU
(rate_limit_us will be 2ms for most systems).

Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1]
Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/
Signed-off-by: Qais Yousef <qyousef@layalina.io>
Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d4d2f4d1d7cb..e0e19d9c1323 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
 #define CPUFREQ_POLICY_POWERSAVE	(1)
 #define CPUFREQ_POLICY_PERFORMANCE	(2)
 
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER		(1000)
-
 struct cpufreq_governor {
 	char	name[CPUFREQ_NAME_LEN];
 	int	(*init)(struct cpufreq_policy *policy);
-- 
cgit v1.2.3


From 70e6b7d9ae3c63df90a7bba7700e8d5c300c3c60 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 2 Aug 2024 14:55:54 +0100
Subject: x86/i8253: Disable PIT timer 0 when not in use

Leaving the PIT interrupt running can cause noticeable steal time for
virtual guests. The VMM generally has a timer which toggles the IRQ input
to the PIC and I/O APIC, which takes CPU time away from the guest. Even
on real hardware, running the counter may use power needlessly (albeit
not much).

Make sure it's turned off if it isn't going to be used.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mhkelley@outlook.com>
Link: https://lore.kernel.org/all/20240802135555.564941-1-dwmw2@infradead.org
---
 include/linux/i8253.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index 8336b2f6f834..bf169cfef7f1 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -24,6 +24,7 @@ extern raw_spinlock_t i8253_lock;
 extern bool i8253_clear_counter_on_shutdown;
 extern struct clock_event_device i8253_clockevent;
 extern void clockevent_i8253_init(bool oneshot);
+extern void clockevent_i8253_disable(void);
 
 extern void setup_pit_timer(void);
 
-- 
cgit v1.2.3


From 531b2ca0a940ac9db03f246c8b77c4201de72b00 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 2 Aug 2024 14:55:55 +0100
Subject: clockevents/drivers/i8253: Fix stop sequence for timer 0

According to the data sheet, writing the MODE register should stop the
counter (and thus the interrupts). This appears to work on real hardware,
at least modern Intel and AMD systems. It should also work on Hyper-V.

However, on some buggy virtual machines the mode change doesn't have any
effect until the counter is subsequently loaded (or perhaps when the IRQ
next fires).

So, set MODE 0 and then load the counter, to ensure that those buggy VMs
do the right thing and the interrupts stop. And then write MODE 0 *again*
to stop the counter on compliant implementations too.

Apparently, Hyper-V keeps firing the IRQ *repeatedly* even in mode zero
when it should only happen once, but the second MODE write stops that too.

Userspace test program (mostly written by tglx):
=====
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <sys/io.h>

static __always_inline void __out##bwl(type value, uint16_t port)	\
{									\
	asm volatile("out" #bwl " %" #bw "0, %w1"			\
		     : : "a"(value), "Nd"(port));			\
}									\
									\
static __always_inline type __in##bwl(uint16_t port)			\
{									\
	type value;							\
	asm volatile("in" #bwl " %w1, %" #bw "0"			\
		     : "=a"(value) : "Nd"(port));			\
	return value;							\
}

BUILDIO(b, b, uint8_t)

 #define inb __inb
 #define outb __outb

 #define PIT_MODE	0x43
 #define PIT_CH0	0x40
 #define PIT_CH2	0x42

static int is8254;

static void dump_pit(void)
{
	if (is8254) {
		// Latch and output counter and status
		outb(0xC2, PIT_MODE);
		printf("%02x %02x %02x\n", inb(PIT_CH0), inb(PIT_CH0), inb(PIT_CH0));
	} else {
		// Latch and output counter
		outb(0x0, PIT_MODE);
		printf("%02x %02x\n", inb(PIT_CH0), inb(PIT_CH0));
	}
}

int main(int argc, char* argv[])
{
	int nr_counts = 2;

	if (argc > 1)
		nr_counts = atoi(argv[1]);

	if (argc > 2)
		is8254 = 1;

	if (ioperm(0x40, 4, 1) != 0)
		return 1;

	dump_pit();

	printf("Set oneshot\n");
	outb(0x38, PIT_MODE);
	outb(0x00, PIT_CH0);
	outb(0x0F, PIT_CH0);

	dump_pit();
	usleep(1000);
	dump_pit();

	printf("Set periodic\n");
	outb(0x34, PIT_MODE);
	outb(0x00, PIT_CH0);
	outb(0x0F, PIT_CH0);

	dump_pit();
	usleep(1000);
	dump_pit();
	dump_pit();
	usleep(100000);
	dump_pit();
	usleep(100000);
	dump_pit();

	printf("Set stop (%d counter writes)\n", nr_counts);
	outb(0x30, PIT_MODE);
	while (nr_counts--)
		outb(0xFF, PIT_CH0);

	dump_pit();
	usleep(100000);
	dump_pit();
	usleep(100000);
	dump_pit();

	printf("Set MODE 0\n");
	outb(0x30, PIT_MODE);

	dump_pit();
	usleep(100000);
	dump_pit();
	usleep(100000);
	dump_pit();

	return 0;
}
=====

Suggested-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Michael Kelley <mhkelley@outlook.com>
Link: https://lore.kernel.org/all/20240802135555.564941-2-dwmw2@infradead.org
---
 include/linux/i8253.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i8253.h b/include/linux/i8253.h
index bf169cfef7f1..56c280eb2d4f 100644
--- a/include/linux/i8253.h
+++ b/include/linux/i8253.h
@@ -21,7 +21,6 @@
 #define PIT_LATCH	((PIT_TICK_RATE + HZ/2) / HZ)
 
 extern raw_spinlock_t i8253_lock;
-extern bool i8253_clear_counter_on_shutdown;
 extern struct clock_event_device i8253_clockevent;
 extern void clockevent_i8253_init(bool oneshot);
 extern void clockevent_i8253_disable(void);
-- 
cgit v1.2.3


From e99129e5dbf7ca87233d31ad19348f6ce8627b38 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 1 Aug 2024 13:32:59 -1000
Subject: sched_ext: Allow p->scx.disallow only while loading

From 1232da7eced620537a78f19c8cf3d4a3508e2419 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 31 Jul 2024 09:14:52 -1000

p->scx.disallow provides a way for the BPF scheduler to reject certain tasks
from attaching. It's currently allowed for both the load and fork paths;
however, the latter doesn't actually work as p->sched_class is already set
by the time scx_ops_init_task() is called during fork.

This is a convenience feature which is mostly useful from the load path
anyway. Allow it only from the load path.

v2: Trigger scx_ops_error() iff @p->policy == SCHED_EXT to make it a bit
    easier for the BPF scheduler (David).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Zhangqiao (2012 lab)" <zhangqiao22@huawei.com>
Link: http://lkml.kernel.org/r/20240711110720.1285-1-zhangqiao22@huawei.com
Fixes: 7bb6f0810ecf ("sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT")
Acked-by: David Vernet <void@manifault.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 26e1c33bc844..69f68e2121a8 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -179,11 +179,12 @@ struct sched_ext_entity {
 	 * If set, reject future sched_setscheduler(2) calls updating the policy
 	 * to %SCHED_EXT with -%EACCES.
 	 *
-	 * If set from ops.init_task() and the task's policy is already
-	 * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
-	 * or by inhering the parent's policy during fork, the task's policy is
-	 * rejected and forcefully reverted to %SCHED_NORMAL. The number of
-	 * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected.
+	 * Can be set from ops.init_task() while the BPF scheduler is being
+	 * loaded (!scx_init_task_args->fork). If set and the task's policy is
+	 * already %SCHED_EXT, the task's policy is rejected and forcefully
+	 * reverted to %SCHED_NORMAL. The number of such events are reported
+	 * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag
+	 * during fork is not allowed.
 	 */
 	bool			disallow;	/* reject switching into SCX */
 
-- 
cgit v1.2.3


From 49675f5bdf9ae2624b430dafda4cb29024521625 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 1 Aug 2024 09:34:01 -0700
Subject: net: remove IFF_* re-definition

We re-define values of enum netdev_priv_flags as preprocessor
macros with the same name. I guess this was done to avoid breaking
out of tree modules which may use #ifdef X for kernel compatibility?
Commit 7aa98047df95 ("net: move net_device priv_flags out from UAPI")
which added the enum doesn't say. In any case, the flags with defines
are quite old now, and defines for new flags don't get added.
OOT drivers have to resort to code greps for compat detection, anyway.
Let's delete these defines, save LoC, help LXR link to the right place.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20240801163401.378723-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 607009150b5f..0ef3eaa23f4b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1689,38 +1689,6 @@ enum netdev_priv_flags {
 	IFF_SEE_ALL_HWTSTAMP_REQUESTS	= BIT_ULL(33),
 };
 
-#define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
-#define IFF_EBRIDGE			IFF_EBRIDGE
-#define IFF_BONDING			IFF_BONDING
-#define IFF_ISATAP			IFF_ISATAP
-#define IFF_WAN_HDLC			IFF_WAN_HDLC
-#define IFF_XMIT_DST_RELEASE		IFF_XMIT_DST_RELEASE
-#define IFF_DONT_BRIDGE			IFF_DONT_BRIDGE
-#define IFF_DISABLE_NETPOLL		IFF_DISABLE_NETPOLL
-#define IFF_MACVLAN_PORT		IFF_MACVLAN_PORT
-#define IFF_BRIDGE_PORT			IFF_BRIDGE_PORT
-#define IFF_OVS_DATAPATH		IFF_OVS_DATAPATH
-#define IFF_TX_SKB_SHARING		IFF_TX_SKB_SHARING
-#define IFF_UNICAST_FLT			IFF_UNICAST_FLT
-#define IFF_TEAM_PORT			IFF_TEAM_PORT
-#define IFF_SUPP_NOFCS			IFF_SUPP_NOFCS
-#define IFF_LIVE_ADDR_CHANGE		IFF_LIVE_ADDR_CHANGE
-#define IFF_MACVLAN			IFF_MACVLAN
-#define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
-#define IFF_L3MDEV_MASTER		IFF_L3MDEV_MASTER
-#define IFF_NO_QUEUE			IFF_NO_QUEUE
-#define IFF_OPENVSWITCH			IFF_OPENVSWITCH
-#define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
-#define IFF_TEAM			IFF_TEAM
-#define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
-#define IFF_PHONY_HEADROOM		IFF_PHONY_HEADROOM
-#define IFF_MACSEC			IFF_MACSEC
-#define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
-#define IFF_FAILOVER			IFF_FAILOVER
-#define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
-#define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
-#define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR
-
 /* Specifies the type of the struct net_device::ml_priv pointer */
 enum netdev_ml_priv_type {
 	ML_PRIV_NONE,
-- 
cgit v1.2.3


From 7e1d512dab5042aa1c2224ed362be79e3f22a15e Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Thu, 1 Aug 2024 21:00:03 +0100
Subject: linkmode: Change return type of linkmode_andnot to bool

linkmode_andnot() simply returns the result of bitmap_andnot().
And the return type of bitmap_andnot() is bool.
So it makes sense for the return type of linkmode_andnot()
to also be bool.

I checked all call-sites and they either ignore the return
value or treat it as a bool.

Compile tested only.

Link: https://lore.kernel.org/netdev/68088998-4486-4930-90a4-96a32f08c490@lunn.ch/
Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20240801-linkfield-bowl-v1-1-d58f68967802@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/linkmode.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index d94bfd9ac8cc..3b9de09871f6 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -37,8 +37,9 @@ static inline bool linkmode_empty(const unsigned long *src)
 	return bitmap_empty(src, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
-				  const unsigned long *src2)
+static inline bool linkmode_andnot(unsigned long *dst,
+				   const unsigned long *src1,
+				   const unsigned long *src2)
 {
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
-- 
cgit v1.2.3


From 8585632a8600d26d4e6b7246d375a9c3a344f0d2 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Tue, 9 Jul 2024 13:14:28 +0200
Subject: iio: backend: remove unused parameter

Indio_dev was not being used in iio_backend_extend_chan_spec() so remove
it.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240709-dev-iio-backend-add-debugfs-v1-1-fb4b8f2373c7@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 8099759d7242..4e81931703ab 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -142,8 +142,7 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private,
 ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private,
 				 const struct iio_chan_spec *chan, char *buf);
 
-int iio_backend_extend_chan_spec(struct iio_dev *indio_dev,
-				 struct iio_backend *back,
+int iio_backend_extend_chan_spec(struct iio_backend *back,
 				 struct iio_chan_spec *chan);
 void *iio_backend_get_priv(const struct iio_backend *conv);
 struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name);
-- 
cgit v1.2.3


From 2477d7b179d3295c9978420027750f047976bd04 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Fri, 26 Jul 2024 15:18:40 -0500
Subject: iio: backend: spelling: continuous -> continuous

This fixes the spelling in IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://patch.msgid.link/20240726-iio-backend-spelling-continuous-v1-1-467c6e3f78ff@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 4e81931703ab..29c4cf0bd761 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -17,7 +17,7 @@ enum iio_backend_data_type {
 };
 
 enum iio_backend_data_source {
-	IIO_BACKEND_INTERNAL_CONTINUOS_WAVE,
+	IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE,
 	IIO_BACKEND_EXTERNAL,
 	IIO_BACKEND_DATA_SOURCE_MAX
 };
-- 
cgit v1.2.3


From f44e94afbb340be90100f8a172ebb14a9e717c8b Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 26 Jul 2024 10:23:15 +0200
Subject: iio: core: annotate masklength as __private

Now that all users are using the proper accessors, we can mark
masklength as __private so that no one tries to write. We also get help
from checkers in warning us in case someone does it.

To access the private field from IIO core code, we need to use the
ACCESS_PRIVATE() macro.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240726-dev-iio-masklength-private3-v1-23-82913fc0fb87@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index dd6bbc468283..f6c0499853bb 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -609,7 +609,7 @@ struct iio_dev {
 	int				scan_bytes;
 
 	const unsigned long		*available_scan_masks;
-	unsigned			masklength;
+	unsigned			__private masklength;
 	const unsigned long		*active_scan_mask;
 	bool				scan_timestamp;
 	struct iio_trigger		*trig;
@@ -861,7 +861,7 @@ static inline const struct iio_scan_type
  */
 static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev)
 {
-	return indio_dev->masklength;
+	return ACCESS_PRIVATE(indio_dev, masklength);
 }
 
 /**
-- 
cgit v1.2.3


From d092b6869817208326005df0a656e3bb9724d89f Mon Sep 17 00:00:00 2001
From: Julien Stephan <jstephan@baylibre.com>
Date: Wed, 31 Jul 2024 09:05:43 +0200
Subject: iio: core: add function to retrieve active_scan_mask index

Add a function to retrieve the index of the active scan mask inside the
available scan masks array.

As in iio_scan_mask_match and iio_sanity_check_avail_scan_masks,
this function does not handle multi-long masks correctly.
It only checks the first long to be zero, and will use such mask
as a terminator even if there was bits set after the first long.

This should be fine since the available_scan_mask has already been
sanity tested using iio_sanity_check_avail_scan_masks.

See iio_scan_mask_match and iio_sanity_check_avail_scan_masks for
more details

Signed-off-by: Julien Stephan <jstephan@baylibre.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Link: https://patch.msgid.link/20240731-ad7380-add-single-ended-chips-v2-2-cd63bf05744c@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f6c0499853bb..3a735a9a9eae 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -864,6 +864,8 @@ static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev)
 	return ACCESS_PRIVATE(indio_dev, masklength);
 }
 
+int iio_active_scan_mask_index(struct iio_dev *indio_dev);
+
 /**
  * iio_for_each_active_channel - Iterated over active channels
  * @indio_dev: the IIO device
-- 
cgit v1.2.3


From 2256f37e24b1979f9a97de0b76cabbf8544a8aff Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 2 Aug 2024 16:26:59 +0200
Subject: iio: backend: introduce struct iio_backend_info

Instead of only passing the backend ops when calling
devm_iio_backend_register(), pass an info like structure that will
contains the ops and additional information. Fow now, the backend name
is being added as that will be used by the debugFS interface introduced
in a later patch.

It also opens the door for further customizations passed by backends.

All users of devm_iio_backend_register() were updated accordingly.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-1-4cb62852f0d0@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 29c4cf0bd761..f120fa2e0a43 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -115,6 +115,16 @@ struct iio_backend_ops {
 			    const struct iio_chan_spec *chan, char *buf);
 };
 
+/**
+ * struct iio_backend_info - info structure for an iio_backend
+ * @name: Backend name.
+ * @ops: Backend operations.
+ */
+struct iio_backend_info {
+	const char *name;
+	const struct iio_backend_ops *ops;
+};
+
 int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan);
 int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan);
 int devm_iio_backend_enable(struct device *dev, struct iio_backend *back);
@@ -151,6 +161,6 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev,
 					  struct fwnode_handle *fwnode);
 
 int devm_iio_backend_register(struct device *dev,
-			      const struct iio_backend_ops *ops, void *priv);
+			      const struct iio_backend_info *info, void *priv);
 
 #endif
-- 
cgit v1.2.3


From cdf01e0809a4c6c7877ea52401c2a6679df7aed6 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 2 Aug 2024 16:27:00 +0200
Subject: iio: backend: add debugFs interface

This adds a basic debugfs interface for backends. Two new ops are being
added:

 * debugfs_reg_access: Analogous to the core IIO one but for backend
   devices.
 * debugfs_print_chan_status: One useful usecase for this one is for
   testing test tones in a digital interface and "ask" the backend to
   dump more details on why a test tone might have errors.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-2-4cb62852f0d0@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index f120fa2e0a43..9d0dba7ab9e7 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -22,6 +22,8 @@ enum iio_backend_data_source {
 	IIO_BACKEND_DATA_SOURCE_MAX
 };
 
+#define iio_backend_debugfs_ptr(ptr)	PTR_IF(IS_ENABLED(CONFIG_DEBUG_FS), ptr)
+
 /**
  * IIO_BACKEND_EX_INFO - Helper for an IIO extended channel attribute
  * @_name: Attribute name
@@ -81,6 +83,8 @@ enum iio_backend_sample_trigger {
  * @extend_chan_spec: Extend an IIO channel.
  * @ext_info_set: Extended info setter.
  * @ext_info_get: Extended info getter.
+ * @debugfs_print_chan_status: Print channel status into a buffer.
+ * @debugfs_reg_access: Read or write register value of backend.
  **/
 struct iio_backend_ops {
 	int (*enable)(struct iio_backend *back);
@@ -113,6 +117,11 @@ struct iio_backend_ops {
 			    const char *buf, size_t len);
 	int (*ext_info_get)(struct iio_backend *back, uintptr_t private,
 			    const struct iio_chan_spec *chan, char *buf);
+	int (*debugfs_print_chan_status)(struct iio_backend *back,
+					 unsigned int chan, char *buf,
+					 size_t len);
+	int (*debugfs_reg_access)(struct iio_backend *back, unsigned int reg,
+				  unsigned int writeval, unsigned int *readval);
 };
 
 /**
@@ -163,4 +172,9 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev,
 int devm_iio_backend_register(struct device *dev,
 			      const struct iio_backend_info *info, void *priv);
 
+ssize_t iio_backend_debugfs_print_chan_status(struct iio_backend *back,
+					      unsigned int chan, char *buf,
+					      size_t len);
+void iio_backend_debugfs_add(struct iio_backend *back,
+			     struct iio_dev *indio_dev);
 #endif
-- 
cgit v1.2.3


From b001635a7c3014fe7745a507840c5d8f58235c04 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 2 Aug 2024 16:27:01 +0200
Subject: iio: backend: add a modified prbs23 support

Support ADI specific prb23 sequence that can be used both for
calibrating or debugging digital interfaces.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-3-4cb62852f0d0@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 9d0dba7ab9e7..2b9d1aa86552 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -56,6 +56,8 @@ enum iio_backend_test_pattern {
 	IIO_BACKEND_NO_TEST_PATTERN,
 	/* modified prbs9 */
 	IIO_BACKEND_ADI_PRBS_9A = 32,
+	/* modified prbs23 */
+	IIO_BACKEND_ADI_PRBS_23A,
 	IIO_BACKEND_TEST_PATTERN_MAX
 };
 
-- 
cgit v1.2.3


From d21fe1e9a6a44e752e227d3a43f41aed790dad95 Mon Sep 17 00:00:00 2001
From: Inochi Amaoto <inochiama@outlook.com>
Date: Thu, 18 Jul 2024 10:23:54 +0800
Subject: pinctrl: pinconf-generic: Add support for "input-schmitt-microvolt"
 property

Add "input-schmitt-microvolt" property to generic options used for DT
parsing files. This enables drivers, which use generic pin configurations,
to get the value passed to this property.

Signed-off-by: Inochi Amaoto <inochiama@outlook.com>
Link: https://lore.kernel.org/IA1PR20MB4953806785BA04E075DC4F03BBAC2@IA1PR20MB4953.namprd20.prod.outlook.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index a65d3d078e58..53cfde98433d 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -81,6 +81,8 @@ struct pinctrl_map;
  * @PIN_CONFIG_INPUT_SCHMITT_ENABLE: control schmitt-trigger mode on the pin.
  *      If the argument != 0, schmitt-trigger mode is enabled. If it's 0,
  *      schmitt-trigger mode is disabled.
+ * @PIN_CONFIG_INPUT_SCHMITT_UV: this will configure an input pin to run in
+ *	schmitt-trigger mode. The argument is in uV.
  * @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power
  *	operation, if several modes of operation are supported these can be
  *	passed in the argument on a custom form, else just use argument 1
@@ -132,6 +134,7 @@ enum pin_config_param {
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
 	PIN_CONFIG_INPUT_SCHMITT_ENABLE,
+	PIN_CONFIG_INPUT_SCHMITT_UV,
 	PIN_CONFIG_MODE_LOW_POWER,
 	PIN_CONFIG_MODE_PWM,
 	PIN_CONFIG_OUTPUT,
-- 
cgit v1.2.3


From d7bdb8e6aabe218fd980768a1486434e42761539 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 27 May 2024 16:25:51 +0200
Subject: pmdomain: core: Enable s2idle for CPU PM domains on PREEMPT_RT

To allow a genpd provider for a CPU PM domain to enter a domain-idle-state
during s2idle on a PREEMPT_RT based configuration, we can't use the regular
spinlock, as they are turned into sleepable locks on PREEMPT_RT.

To address this problem, let's convert into using the raw spinlock, but
only for genpd providers that have the GENPD_FLAG_CPU_DOMAIN bit set. In
this way, the lock can still be acquired/released in atomic context, which
is needed in the idle-path for PREEMPT_RT.

Do note that the genpd power-on/off notifiers may also be fired during
s2idle, but these are already prepared for PREEMPT_RT as they are based on
the raw notifiers. However, consumers of them may need to adopt accordingly
to work properly on PREEMPT_RT.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Raghavendra Kakarla <quic_rkakarla@quicinc.com>  # qcm6490 with PREEMPT_RT set
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/20240527142557.321610-2-ulf.hansson@linaro.org
---
 include/linux/pm_domain.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 858c8e7851fb..b86bb52858ac 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -198,8 +198,11 @@ struct generic_pm_domain {
 			spinlock_t slock;
 			unsigned long lock_flags;
 		};
+		struct {
+			raw_spinlock_t raw_slock;
+			unsigned long raw_lock_flags;
+		};
 	};
-
 };
 
 static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
-- 
cgit v1.2.3


From d5934e76316e84eced836b6b2bafae1837d1cd58 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Mar 2024 16:48:48 -0700
Subject: cleanup: Add usage and style documentation

When proposing that PCI grow some new cleanup helpers for pci_dev_put()
and pci_dev_{lock,unlock} [1], Bjorn had some fundamental questions
about expectations and best practices. Upon reviewing an updated
changelog with those details he recommended adding them to documentation
in the header file itself.

Add that documentation and link it into the rendering for
Documentation/core-api/.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/171175585714.2192972.12661675876300167762.stgit@dwillia2-xfh.jf.intel.com
---
 include/linux/cleanup.h | 136 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index d9e613803df1..9c6b4f2c0176 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -4,6 +4,142 @@
 
 #include <linux/compiler.h>
 
+/**
+ * DOC: scope-based cleanup helpers
+ *
+ * The "goto error" pattern is notorious for introducing subtle resource
+ * leaks. It is tedious and error prone to add new resource acquisition
+ * constraints into code paths that already have several unwind
+ * conditions. The "cleanup" helpers enable the compiler to help with
+ * this tedium and can aid in maintaining LIFO (last in first out)
+ * unwind ordering to avoid unintentional leaks.
+ *
+ * As drivers make up the majority of the kernel code base, here is an
+ * example of using these helpers to clean up PCI drivers. The target of
+ * the cleanups are occasions where a goto is used to unwind a device
+ * reference (pci_dev_put()), or unlock the device (pci_dev_unlock())
+ * before returning.
+ *
+ * The DEFINE_FREE() macro can arrange for PCI device references to be
+ * dropped when the associated variable goes out of scope::
+ *
+ *	DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T))
+ *	...
+ *	struct pci_dev *dev __free(pci_dev_put) =
+ *		pci_get_slot(parent, PCI_DEVFN(0, 0));
+ *
+ * The above will automatically call pci_dev_put() if @dev is non-NULL
+ * when @dev goes out of scope (automatic variable scope). If a function
+ * wants to invoke pci_dev_put() on error, but return @dev (i.e. without
+ * freeing it) on success, it can do::
+ *
+ *	return no_free_ptr(dev);
+ *
+ * ...or::
+ *
+ *	return_ptr(dev);
+ *
+ * The DEFINE_GUARD() macro can arrange for the PCI device lock to be
+ * dropped when the scope where guard() is invoked ends::
+ *
+ *	DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
+ *	...
+ *	guard(pci_dev)(dev);
+ *
+ * The lifetime of the lock obtained by the guard() helper follows the
+ * scope of automatic variable declaration. Take the following example::
+ *
+ *	func(...)
+ *	{
+ *		if (...) {
+ *			...
+ *			guard(pci_dev)(dev); // pci_dev_lock() invoked here
+ *			...
+ *		} // <- implied pci_dev_unlock() triggered here
+ *	}
+ *
+ * Observe the lock is held for the remainder of the "if ()" block not
+ * the remainder of "func()".
+ *
+ * Now, when a function uses both __free() and guard(), or multiple
+ * instances of __free(), the LIFO order of variable definition order
+ * matters. GCC documentation says:
+ *
+ * "When multiple variables in the same scope have cleanup attributes,
+ * at exit from the scope their associated cleanup functions are run in
+ * reverse order of definition (last defined, first cleanup)."
+ *
+ * When the unwind order matters it requires that variables be defined
+ * mid-function scope rather than at the top of the file.  Take the
+ * following example and notice the bug highlighted by "!!"::
+ *
+ *	LIST_HEAD(list);
+ *	DEFINE_MUTEX(lock);
+ *
+ *	struct object {
+ *	        struct list_head node;
+ *	};
+ *
+ *	static struct object *alloc_add(void)
+ *	{
+ *	        struct object *obj;
+ *
+ *	        lockdep_assert_held(&lock);
+ *	        obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ *	        if (obj) {
+ *	                LIST_HEAD_INIT(&obj->node);
+ *	                list_add(obj->node, &list):
+ *	        }
+ *	        return obj;
+ *	}
+ *
+ *	static void remove_free(struct object *obj)
+ *	{
+ *	        lockdep_assert_held(&lock);
+ *	        list_del(&obj->node);
+ *	        kfree(obj);
+ *	}
+ *
+ *	DEFINE_FREE(remove_free, struct object *, if (_T) remove_free(_T))
+ *	static int init(void)
+ *	{
+ *	        struct object *obj __free(remove_free) = NULL;
+ *	        int err;
+ *
+ *	        guard(mutex)(&lock);
+ *	        obj = alloc_add();
+ *
+ *	        if (!obj)
+ *	                return -ENOMEM;
+ *
+ *	        err = other_init(obj);
+ *	        if (err)
+ *	                return err; // remove_free() called without the lock!!
+ *
+ *	        no_free_ptr(obj);
+ *	        return 0;
+ *	}
+ *
+ * That bug is fixed by changing init() to call guard() and define +
+ * initialize @obj in this order::
+ *
+ *	guard(mutex)(&lock);
+ *	struct object *obj __free(remove_free) = alloc_add();
+ *
+ * Given that the "__free(...) = NULL" pattern for variables defined at
+ * the top of the function poses this potential interdependency problem
+ * the recommendation is to always define and assign variables in one
+ * statement and not group variable definitions at the top of the
+ * function when __free() is used.
+ *
+ * Lastly, given that the benefit of cleanup helpers is removal of
+ * "goto", and that the "goto" statement can jump between scopes, the
+ * expectation is that usage of "goto" and cleanup helpers is never
+ * mixed in the same function. I.e. for a given routine, convert all
+ * resources that need a "goto" cleanup to scope-based cleanup, or
+ * convert none of them.
+ */
+
 /*
  * DEFINE_FREE(name, type, free):
  *	simple helper macro that defines the required wrapper for a __free()
-- 
cgit v1.2.3


From a720dee5e039238a44c0142dfccdc0e35c1125f7 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Sat, 13 Jul 2024 19:47:33 +1200
Subject: hid-asus: use hid for brightness control on keyboard

On almost all ASUS ROG series laptops the MCU used for the USB keyboard
also has a HID packet used for setting the brightness. This is usually
the same as the WMI method. But in some laptops the WMI method either
is missing or doesn't work, so we should default to the HID control.

Signed-off-by: Luke D. Jones <luke@ljones.dev>
Acked-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/20240713074733.77334-2-luke@ljones.dev
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 36 ++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 5e6f407b2def..b601b245a035 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -4,6 +4,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/dmi.h>
 
 /* WMI Methods */
 #define ASUS_WMI_METHODID_SPEC	        0x43455053 /* BIOS SPECification */
@@ -165,4 +166,39 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1,
 }
 #endif
 
+/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */
+static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = {
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GA403U"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "GU605M"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_BOARD_NAME, "RC71L"),
+		},
+	},
+	{ },
+};
+
 #endif	/* __PLATFORM_DATA_X86_ASUS_WMI_H */
-- 
cgit v1.2.3


From 25162a4f64f8ba0065f300977589fe1f6af332f0 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 4 Aug 2024 17:16:25 -0700
Subject: Input: cyttsp4 - remove driver

The cyttsp4 touchscreen driver was contributed in 2013 and since then
has seen no updates. The driver uses platform data (no device tree
support) and there are no users of it in the mainline kernel. There were
occasional fixes to it for issues either found by static code analysis
tools or via visual inspection, but otherwise the driver is completely
untested.

Remove the driver.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/ZrAZ2cUow_z838tp@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/cyttsp4.h | 62 -----------------------------------
 1 file changed, 62 deletions(-)
 delete mode 100644 include/linux/platform_data/cyttsp4.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cyttsp4.h b/include/linux/platform_data/cyttsp4.h
deleted file mode 100644
index 5dc9d2be384b..000000000000
--- a/include/linux/platform_data/cyttsp4.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Header file for:
- * Cypress TrueTouch(TM) Standard Product (TTSP) touchscreen drivers.
- * For use with Cypress Txx3xx parts.
- * Supported parts include:
- * CY8CTST341
- * CY8CTMA340
- *
- * Copyright (C) 2009, 2010, 2011 Cypress Semiconductor, Inc.
- * Copyright (C) 2012 Javier Martinez Canillas <javier@dowhile0.org>
- *
- * Contact Cypress Semiconductor at www.cypress.com (kev@cypress.com)
- */
-#ifndef _CYTTSP4_H_
-#define _CYTTSP4_H_
-
-#define CYTTSP4_MT_NAME "cyttsp4_mt"
-#define CYTTSP4_I2C_NAME "cyttsp4_i2c_adapter"
-#define CYTTSP4_SPI_NAME "cyttsp4_spi_adapter"
-
-#define CY_TOUCH_SETTINGS_MAX 32
-
-struct touch_framework {
-	const uint16_t  *abs;
-	uint8_t         size;
-	uint8_t         enable_vkeys;
-} __packed;
-
-struct cyttsp4_mt_platform_data {
-	struct touch_framework *frmwrk;
-	unsigned short flags;
-	char const *inp_dev_name;
-};
-
-struct touch_settings {
-	const uint8_t *data;
-	uint32_t size;
-	uint8_t tag;
-} __packed;
-
-struct cyttsp4_core_platform_data {
-	int irq_gpio;
-	int rst_gpio;
-	int level_irq_udelay;
-	int (*xres)(struct cyttsp4_core_platform_data *pdata,
-		struct device *dev);
-	int (*init)(struct cyttsp4_core_platform_data *pdata,
-		int on, struct device *dev);
-	int (*power)(struct cyttsp4_core_platform_data *pdata,
-		int on, struct device *dev, atomic_t *ignore_irq);
-	int (*irq_stat)(struct cyttsp4_core_platform_data *pdata,
-		struct device *dev);
-	struct touch_settings *sett[CY_TOUCH_SETTINGS_MAX];
-};
-
-struct cyttsp4_platform_data {
-	struct cyttsp4_core_platform_data *core_pdata;
-	struct cyttsp4_mt_platform_data *mt_pdata;
-};
-
-#endif /* _CYTTSP4_H_ */
-- 
cgit v1.2.3


From d4433e8b405a882fa2ef29601c4ad262ba6e5526 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Aug 2024 13:40:26 +0000
Subject: inet: constify 'struct net' parameter of various lookup helpers

Following helpers do not touch their struct net argument:

- bpf_sk_lookup_run_v4()
- inet_lookup_reuseport()
- inet_lhash2_lookup()
- inet_lookup_run_sk_lookup()
- __inet_lookup_listener()
- __inet_lookup_established()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20240802134029.3748005-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b6672ff61407..4acd1da4dac6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1616,7 +1616,7 @@ extern struct static_key_false bpf_sk_lookup_enabled;
 		_all_pass || _selected_sk ? SK_PASS : SK_DROP;		\
 	 })
 
-static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
+static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol,
 					const __be32 saddr, const __be16 sport,
 					const __be32 daddr, const u16 dport,
 					const int ifindex, struct sock **psk)
-- 
cgit v1.2.3


From 10b2a44ccb0cdf85480b6f73a23530aa3ac722de Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Aug 2024 13:40:28 +0000
Subject: inet6: constify 'struct net' parameter of various lookup helpers

Following helpers do not touch their struct net argument:

- bpf_sk_lookup_run_v6()
- __inet6_lookup_established()
- inet6_lookup_reuseport()
- inet6_lookup_listener()
- inet6_lookup_run_sk_lookup()
- __inet6_lookup()
- inet6_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20240802134029.3748005-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4acd1da4dac6..64e1506fefb8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1653,7 +1653,7 @@ static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
+static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol,
 					const struct in6_addr *saddr,
 					const __be16 sport,
 					const struct in6_addr *daddr,
-- 
cgit v1.2.3


From 7e45c1e9edc0004b914e2d5a33245908a06f072c Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Tue, 30 Jul 2024 16:40:52 +0300
Subject: net/mlx5: Add support for MTPTM and MTCTR registers

Make Management Precision Time Measurement (MTPTM) register and Management
Cross Timestamp (MTCTR) register usable in mlx5 driver.

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
Tested-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20240730134055.1835261-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/device.h   |  7 ++++++-
 include/linux/mlx5/driver.h   |  2 ++
 include/linux/mlx5/mlx5_ifc.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index ba875a619b97..a94bc9e3af96 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1243,7 +1243,8 @@ enum mlx5_pcam_feature_groups {
 enum mlx5_mcam_reg_groups {
 	MLX5_MCAM_REGS_FIRST_128                    = 0x0,
 	MLX5_MCAM_REGS_0x9100_0x917F                = 0x2,
-	MLX5_MCAM_REGS_NUM                          = 0x3,
+	MLX5_MCAM_REGS_0x9180_0x91FF                = 0x3,
+	MLX5_MCAM_REGS_NUM                          = 0x4,
 };
 
 enum mlx5_mcam_feature_groups {
@@ -1392,6 +1393,10 @@ enum mlx5_qcam_feature_groups {
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9100_0x917F], \
 		 mng_access_reg_cap_mask.access_regs2.reg)
 
+#define MLX5_CAP_MCAM_REG3(mdev, reg) \
+	MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9180_0x91FF], \
+		 mng_access_reg_cap_mask.access_regs3.reg)
+
 #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld)
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a96438ded15f..9f42834f57c5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -159,6 +159,8 @@ enum {
 	MLX5_REG_MSECQ		 = 0x9155,
 	MLX5_REG_MSEES		 = 0x9156,
 	MLX5_REG_MIRC		 = 0x9162,
+	MLX5_REG_MTPTM		 = 0x9180,
+	MLX5_REG_MTCTR		 = 0x9181,
 	MLX5_REG_SBCAM		 = 0xB01F,
 	MLX5_REG_RESOURCE_DUMP   = 0xC000,
 	MLX5_REG_DTOR            = 0xC00E,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cab228cf51c6..234ad6f16e92 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10401,6 +10401,18 @@ struct mlx5_ifc_mcam_access_reg_bits2 {
 	u8         regs_31_to_0[0x20];
 };
 
+struct mlx5_ifc_mcam_access_reg_bits3 {
+	u8         regs_127_to_96[0x20];
+
+	u8         regs_95_to_64[0x20];
+
+	u8         regs_63_to_32[0x20];
+
+	u8         regs_31_to_2[0x1e];
+	u8         mtctr[0x1];
+	u8         mtptm[0x1];
+};
+
 struct mlx5_ifc_mcam_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         feature_group[0x8];
@@ -10413,6 +10425,7 @@ struct mlx5_ifc_mcam_reg_bits {
 		struct mlx5_ifc_mcam_access_reg_bits access_regs;
 		struct mlx5_ifc_mcam_access_reg_bits1 access_regs1;
 		struct mlx5_ifc_mcam_access_reg_bits2 access_regs2;
+		struct mlx5_ifc_mcam_access_reg_bits3 access_regs3;
 		u8         reserved_at_0[0x80];
 	} mng_access_reg_cap_mask;
 
@@ -11166,6 +11179,34 @@ struct mlx5_ifc_mtmp_reg_bits {
 	u8         sensor_name_lo[0x20];
 };
 
+struct mlx5_ifc_mtptm_reg_bits {
+	u8         reserved_at_0[0x10];
+	u8         psta[0x1];
+	u8         reserved_at_11[0xf];
+
+	u8         reserved_at_20[0x60];
+};
+
+enum {
+	MLX5_MTCTR_REQUEST_NOP = 0x0,
+	MLX5_MTCTR_REQUEST_PTM_ROOT_CLOCK = 0x1,
+	MLX5_MTCTR_REQUEST_FREE_RUNNING_COUNTER = 0x2,
+	MLX5_MTCTR_REQUEST_REAL_TIME_CLOCK = 0x3,
+};
+
+struct mlx5_ifc_mtctr_reg_bits {
+	u8         first_clock_timestamp_request[0x8];
+	u8         second_clock_timestamp_request[0x8];
+	u8         reserved_at_10[0x10];
+
+	u8         first_clock_valid[0x1];
+	u8         second_clock_valid[0x1];
+	u8         reserved_at_22[0x1e];
+
+	u8         first_clock_timestamp[0x40];
+	u8         second_clock_timestamp[0x40];
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -11230,6 +11271,8 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_mrtc_reg_bits mrtc_reg;
 	struct mlx5_ifc_mtcap_reg_bits mtcap_reg;
 	struct mlx5_ifc_mtmp_reg_bits mtmp_reg;
+	struct mlx5_ifc_mtptm_reg_bits mtptm_reg;
+	struct mlx5_ifc_mtctr_reg_bits mtctr_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From c114e9948c2b6a0b400266e59cc656b59e795bca Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Thu, 18 Jul 2024 11:27:24 -0700
Subject: coredump: Standartize and fix logging

The coredump code does not log the process ID and the comm
consistently, logs unescaped comm when it does log it, and
does not always use the ratelimited logging. That makes it
harder to analyze logs and puts the system at the risk of
spamming the system log incase something crashes many times
over and over again.

Fix that by logging TGID and comm (escaped) consistently and
using the ratelimited logging always.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Tested-by: Allen Pais <apais@linux.microsoft.com>
Link: https://lore.kernel.org/r/20240718182743.1959160-2-romank@linux.microsoft.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/coredump.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 0904ba010341..45e598fe3476 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -43,8 +43,30 @@ extern int dump_align(struct coredump_params *cprm, int align);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
 extern void do_coredump(const kernel_siginfo_t *siginfo);
+
+/*
+ * Logging for the coredump code, ratelimited.
+ * The TGID and comm fields are added to the message.
+ */
+
+#define __COREDUMP_PRINTK(Level, Format, ...) \
+	do {	\
+		char comm[TASK_COMM_LEN];	\
+	\
+		get_task_comm(comm, current);	\
+		printk_ratelimited(Level "coredump: %d(%*pE): " Format "\n",	\
+			task_tgid_vnr(current), (int)strlen(comm), comm, ##__VA_ARGS__);	\
+	} while (0)	\
+
+#define coredump_report(fmt, ...) __COREDUMP_PRINTK(KERN_INFO, fmt, ##__VA_ARGS__)
+#define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__)
+
 #else
 static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
+
+#define coredump_report(...)
+#define coredump_report_failure(...)
+
 #endif
 
 #if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL)
-- 
cgit v1.2.3


From fb97d2eb542faf19a8725afbd75cbc2518903210 Mon Sep 17 00:00:00 2001
From: Roman Kisel <romank@linux.microsoft.com>
Date: Thu, 18 Jul 2024 11:27:25 -0700
Subject: binfmt_elf, coredump: Log the reason of the failed core dumps

Missing, failed, or corrupted core dumps might impede crash
investigations. To improve reliability of that process and consequently
the programs themselves, one needs to trace the path from producing
a core dumpfile to analyzing it. That path starts from the core dump file
written to the disk by the kernel or to the standard input of a user
mode helper program to which the kernel streams the coredump contents.
There are cases where the kernel will interrupt writing the core out or
produce a truncated/not-well-formed core dump without leaving a note.

Add logging for the core dump collection failure paths to be able to reason
what has gone wrong when the core dump is malformed or missing.
Report the size of the data written to aid in diagnosing the user mode
helper.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
Link: https://lore.kernel.org/r/20240718182743.1959160-3-romank@linux.microsoft.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/coredump.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 45e598fe3476..edeb8532ce0f 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -42,7 +42,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
-extern void do_coredump(const kernel_siginfo_t *siginfo);
+extern int do_coredump(const kernel_siginfo_t *siginfo);
 
 /*
  * Logging for the coredump code, ratelimited.
@@ -62,7 +62,11 @@ extern void do_coredump(const kernel_siginfo_t *siginfo);
 #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__)
 
 #else
-static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
+static inline int do_coredump(const kernel_siginfo_t *siginfo)
+{
+	/* Coredump support is not available, can't fail. */
+	return 0;
+}
 
 #define coredump_report(...)
 #define coredump_report_failure(...)
-- 
cgit v1.2.3


From 7424fc6b86c8980a87169e005f5cd4438d18efe6 Mon Sep 17 00:00:00 2001
From: Gatlin Newhouse <gatlin.newhouse@gmail.com>
Date: Wed, 24 Jul 2024 00:01:55 +0000
Subject: x86/traps: Enable UBSAN traps on x86

Currently ARM64 extracts which specific sanitizer has caused a trap via
encoded data in the trap instruction. Clang on x86 currently encodes the
same data in the UD1 instruction but x86 handle_bug() and
is_valid_bugaddr() currently only look at UD2.

Bring x86 to parity with ARM64, similar to commit 25b84002afb9 ("arm64:
Support Clang UBSAN trap codes for better reporting"). See the llvm
links for information about the code generation.

Enable the reporting of UBSAN sanitizer details on x86 compiled with clang
when CONFIG_UBSAN_TRAP=y by analysing UD1 and retrieving the type immediate
which is encoded by the compiler after the UD1.

[ tglx: Simplified it by moving the printk() into handle_bug() ]

Signed-off-by: Gatlin Newhouse <gatlin.newhouse@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/all/20240724000206.451425-1-gatlin.newhouse@gmail.com
Link: https://github.com/llvm/llvm-project/commit/c5978f42ec8e9#diff-bb68d7cd885f41cfc35843998b0f9f534adb60b415f647109e597ce448e92d9f
Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86InstrSystem.td#L27
---
 include/linux/ubsan.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h
index bff7445498de..d8219cbe09ff 100644
--- a/include/linux/ubsan.h
+++ b/include/linux/ubsan.h
@@ -4,6 +4,11 @@
 
 #ifdef CONFIG_UBSAN_TRAP
 const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type);
+#else
+static inline const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type)
+{
+	return NULL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 97edbc02b2efdb0cd0f507b6a45fc509acc861bb Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 10 Jul 2024 14:51:11 -0400
Subject: buffer: Convert block_write_end() to take a folio

All callers now have a folio, so pass it in instead of converting
from a folio to a page and back to a folio again.  Saves a call
to compound_head().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/buffer_head.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 14acf1bbe0ce..3a3fec154536 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -262,8 +262,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block);
 int block_write_end(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page *, void *);
+				loff_t, unsigned len, unsigned copied,
+				struct folio *, void *);
 int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
-- 
cgit v1.2.3


From a225800f322a3d6cc8b8b6c7dc4d5281f2f5375b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 10 Jul 2024 15:45:32 -0400
Subject: fs: Convert aops->write_end to take a folio

Most callers have a folio, and most implementations operate on a folio,
so remove the conversion from folio->page->folio to fit through this
interface.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/buffer_head.h | 4 ++--
 include/linux/fs.h          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3a3fec154536..165e859664a5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -265,8 +265,8 @@ int block_write_end(struct file *, struct address_space *,
 				loff_t, unsigned len, unsigned copied,
 				struct folio *, void *);
 int generic_write_end(struct file *, struct address_space *,
-				loff_t, unsigned, unsigned,
-				struct page *, void *);
+				loff_t, unsigned len, unsigned copied,
+				struct folio *, void *);
 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, struct page **, void **,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..1af3373c7cd2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -411,7 +411,7 @@ struct address_space_operations {
 				struct page **pagep, void **fsdata);
 	int (*write_end)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata);
+				struct folio *folio, void *fsdata);
 
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
-- 
cgit v1.2.3


From 1da86618bdce301d23e89ecce92161f9d3b3c5e7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 15 Jul 2024 14:24:01 -0400
Subject: fs: Convert aops->write_begin to take a folio

Convert all callers from working on a page to working on one page
of a folio (support for working on an entire folio can come later).
Removes a lot of folio->page->folio conversions.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/buffer_head.h | 4 ++--
 include/linux/fs.h          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 165e859664a5..254563a2a9de 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -258,7 +258,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
 int block_read_full_folio(struct folio *, get_block_t *);
 bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
-		struct page **pagep, get_block_t *get_block);
+		struct folio **foliop, get_block_t *get_block);
 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block);
 int block_write_end(struct file *, struct address_space *,
@@ -269,7 +269,7 @@ int generic_write_end(struct file *, struct address_space *,
 				struct folio *, void *);
 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
-			unsigned, struct page **, void **,
+			unsigned, struct folio **, void **,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 void block_commit_write(struct page *page, unsigned int from, unsigned int to);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1af3373c7cd2..7a79b88a8f5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -408,7 +408,7 @@ struct address_space_operations {
 
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len,
-				struct page **pagep, void **fsdata);
+				struct folio **foliop, void **fsdata);
 	int (*write_end)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct folio *folio, void *fsdata);
@@ -3331,7 +3331,7 @@ extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len,
-			struct page **pagep, void **fsdata);
+			struct folio **foliop, void **fsdata);
 extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
-- 
cgit v1.2.3


From 9f04609f74ec7a439e1ac42da5db9e6ddf4f7b13 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 10 Jul 2024 23:09:04 -0400
Subject: buffer: Convert __block_write_begin() to take a folio

Almost all callers have a folio now, so change __block_write_begin()
to take a folio and remove a call to compound_head().

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/buffer_head.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 254563a2a9de..c85e65fd4cdc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -259,7 +259,7 @@ int block_read_full_folio(struct folio *, get_block_t *);
 bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 		struct folio **foliop, get_block_t *get_block);
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 		get_block_t *get_block);
 int block_write_end(struct file *, struct address_space *,
 				loff_t, unsigned len, unsigned copied,
-- 
cgit v1.2.3


From 07a83512fa5be3f3d46369eee6352102fd9fb505 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:08 +0100
Subject: usb: typec: tcpci: fix a comment typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

autonously -> autonomously

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-1-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 0ab39b6ea205..d27fe0c22a8b 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -190,7 +190,7 @@ struct tcpci;
  *		Optional; Callback to perform chip specific operations when FRS
  *		is sourcing vbus.
  * @auto_discharge_disconnect:
- *		Optional; Enables TCPC to autonously discharge vbus on disconnect.
+ *		Optional; Enables TCPC to autonomously discharge vbus on disconnect.
  * @vbus_vsafe0v:
  *		optional; Set when TCPC can detect whether vbus is at VSAFE0V.
  * @set_partner_usb_comm_capable:
-- 
cgit v1.2.3


From f523aa6d72598bcd2946c7e02d29f33f94806a15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:10 +0100
Subject: usb: typec: tcpci: use GENMASK() for TCPC_CC_STATUS_CC[12]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The existing code here, particularly in maxim_contaminant.c, is
arguably quite hard to read due to the open-coded masking and shifting
spanning multiple lines.

Use GENMASK() and FIELD_GET() instead, which arguably make the code
much easier to follow.

While at it, use the symbolic name TCPC_CC_STATE_SRC_OPEN for one
instance of open-coded 0x0.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-3-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index d27fe0c22a8b..31d21ccf662b 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -92,11 +92,9 @@
 #define TCPC_CC_STATUS_TERM		BIT(4)
 #define TCPC_CC_STATUS_TERM_RP		0
 #define TCPC_CC_STATUS_TERM_RD		1
+#define TCPC_CC_STATUS_CC2		GENMASK(3, 2)
+#define TCPC_CC_STATUS_CC1		GENMASK(1, 0)
 #define TCPC_CC_STATE_SRC_OPEN		0
-#define TCPC_CC_STATUS_CC2_SHIFT	2
-#define TCPC_CC_STATUS_CC2_MASK		0x3
-#define TCPC_CC_STATUS_CC1_SHIFT	0
-#define TCPC_CC_STATUS_CC1_MASK		0x3
 
 #define TCPC_POWER_STATUS		0x1e
 #define TCPC_POWER_STATUS_DBG_ACC_CON	BIT(7)
@@ -256,7 +254,7 @@ static inline enum typec_cc_status tcpci_to_typec_cc(unsigned int cc, bool sink)
 		if (sink)
 			return TYPEC_CC_RP_3_0;
 		fallthrough;
-	case 0x0:
+	case TCPC_CC_STATE_SRC_OPEN:
 	default:
 		return TYPEC_CC_OPEN;
 	}
-- 
cgit v1.2.3


From 83b254c13ac093810e1058d9cb1bbb4b7ef9c246 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:11 +0100
Subject: usb: typec: tcpci: use GENMASK() for TCPC_ROLE_CTRL_CC[12]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All this open-coded shifting and masking is quite hard to read, in
particular the if-statement in tcpci_apply_rc().

Declare TCPC_ROLE_CTRL_CC[12] using GENMASK() which allows using
FIELD_GET() and FIELD_PREP() to arguably make the code more legible.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-4-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 31d21ccf662b..552d074429f0 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -68,10 +68,8 @@
 #define TCPC_ROLE_CTRL_RP_VAL_DEF	0x0
 #define TCPC_ROLE_CTRL_RP_VAL_1_5	0x1
 #define TCPC_ROLE_CTRL_RP_VAL_3_0	0x2
-#define TCPC_ROLE_CTRL_CC2_SHIFT	2
-#define TCPC_ROLE_CTRL_CC2_MASK		0x3
-#define TCPC_ROLE_CTRL_CC1_SHIFT	0
-#define TCPC_ROLE_CTRL_CC1_MASK		0x3
+#define TCPC_ROLE_CTRL_CC2		GENMASK(3, 2)
+#define TCPC_ROLE_CTRL_CC1		GENMASK(1, 0)
 #define TCPC_ROLE_CTRL_CC_RA		0x0
 #define TCPC_ROLE_CTRL_CC_RP		0x1
 #define TCPC_ROLE_CTRL_CC_RD		0x2
@@ -176,8 +174,7 @@
 
 #define tcpc_presenting_rd(reg, cc) \
 	(!(TCPC_ROLE_CTRL_DRP & (reg)) && \
-	 (((reg) & (TCPC_ROLE_CTRL_## cc ##_MASK << TCPC_ROLE_CTRL_## cc ##_SHIFT)) == \
-	  (TCPC_ROLE_CTRL_CC_RD << TCPC_ROLE_CTRL_## cc ##_SHIFT)))
+	 FIELD_GET(TCPC_ROLE_CTRL_## cc, reg) == TCPC_ROLE_CTRL_CC_RD)
 
 struct tcpci;
 
-- 
cgit v1.2.3


From 46b1e0f87ba89f0a06ffbde5fe8d13c5af93f94e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:12 +0100
Subject: usb: typec: tcpci: use GENMASK() for TCPC_ROLE_CTRL_RP_VAL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Align the last remaining field TCPC_ROLE_CTRL_RP_VAL with the other
fields in the TCPC_ROLE_CTRL register by using GENMASK() and
FIELD_PREP().

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-5-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 552d074429f0..80652d4f722e 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -63,8 +63,7 @@
 
 #define TCPC_ROLE_CTRL			0x1a
 #define TCPC_ROLE_CTRL_DRP		BIT(6)
-#define TCPC_ROLE_CTRL_RP_VAL_SHIFT	4
-#define TCPC_ROLE_CTRL_RP_VAL_MASK	0x3
+#define TCPC_ROLE_CTRL_RP_VAL		GENMASK(5, 4)
 #define TCPC_ROLE_CTRL_RP_VAL_DEF	0x0
 #define TCPC_ROLE_CTRL_RP_VAL_1_5	0x1
 #define TCPC_ROLE_CTRL_RP_VAL_3_0	0x2
-- 
cgit v1.2.3


From aee4568f42e0d7526207a10944bb89bb45522b59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:13 +0100
Subject: usb: typec: tcpci: use GENMASK() for TCPC_MSG_HDR_INFO_REV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert field TCPC_MSG_HDR_INFO_REV from register TCPC_MSG_HDR_INFO to
using GENMASK() and FIELD_PREP() so as to keep using a similar approach
for all fields.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-6-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 80652d4f722e..3cd61e9f73b3 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -129,9 +129,8 @@
 
 #define TCPC_MSG_HDR_INFO		0x2e
 #define TCPC_MSG_HDR_INFO_DATA_ROLE	BIT(3)
+#define TCPC_MSG_HDR_INFO_REV		GENMASK(2, 1)
 #define TCPC_MSG_HDR_INFO_PWR_ROLE	BIT(0)
-#define TCPC_MSG_HDR_INFO_REV_SHIFT	1
-#define TCPC_MSG_HDR_INFO_REV_MASK	0x3
 
 #define TCPC_RX_DETECT			0x2f
 #define TCPC_RX_DETECT_HARD_RESET	BIT(5)
-- 
cgit v1.2.3


From 7cd41974d2c81055f61ba9f69e31c02e866716e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 10 Jul 2024 11:36:14 +0100
Subject: usb: typec: tcpci: use GENMASK() for TCPC_TRANSMIT register fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert all fields from register TCPC_TRANSMIT to using GENMASK() and
FIELD_PREP() so as to keep using a similar approach throughout the code
base and make it arguably easier to read.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-7-0ec1f41f4263@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tcpci.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h
index 3cd61e9f73b3..f7f5cfbdef12 100644
--- a/include/linux/usb/tcpci.h
+++ b/include/linux/usb/tcpci.h
@@ -148,10 +148,8 @@
 #define TCPC_RX_DATA			0x34 /* through 0x4f */
 
 #define TCPC_TRANSMIT			0x50
-#define TCPC_TRANSMIT_RETRY_SHIFT	4
-#define TCPC_TRANSMIT_RETRY_MASK	0x3
-#define TCPC_TRANSMIT_TYPE_SHIFT	0
-#define TCPC_TRANSMIT_TYPE_MASK		0x7
+#define TCPC_TRANSMIT_RETRY		GENMASK(5, 4)
+#define TCPC_TRANSMIT_TYPE		GENMASK(2, 0)
 
 #define TCPC_TX_BYTE_CNT		0x51
 #define TCPC_TX_HDR			0x52
-- 
cgit v1.2.3


From 08d6fd691bdd5d6e7881ce42ca6774ed92b10896 Mon Sep 17 00:00:00 2001
From: Akash Kumar <quic_akakum@quicinc.com>
Date: Thu, 1 Aug 2024 11:00:03 +0530
Subject: usb: gadget: Increase max configuration interface to 32

Currently, max configuration interfaces are limited to 16, which fails
for compositions containing 10 UVC configurations with interrupt ep
disabled along with other configurations , and we see bind failures
while allocating interface ID in uvc bind.

Increase max configuration interface to 32 to support any large
compositions limited to same size as usb device endpoints as
interfaces cannot be more than endpoints.

Signed-off-by: Akash Kumar <quic_akakum@quicinc.com>
Link: https://lore.kernel.org/r/20240801053003.15153-1-quic_akakum@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/composite.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index af3cd2aae4bc..6e38fb9d2117 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -256,7 +256,7 @@ int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f,
 			struct usb_ep *_ep);
 int usb_func_wakeup(struct usb_function *func);
 
-#define	MAX_CONFIG_INTERFACES		16	/* arbitrary; max 255 */
+#define	MAX_CONFIG_INTERFACES		32
 
 /**
  * struct usb_configuration - represents one gadget configuration
-- 
cgit v1.2.3


From 259b46204885431f2853afa2a2bffa63f3705e67 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Mon, 5 Aug 2024 12:20:37 +0200
Subject: serial: remove quot_frac from serial8250_do_set_divisor()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

quot_frac is unused in serial8250_do_set_divisor() since commit
b2b4b8ed3c06 (serial: 8250_exar: Move custom divisor support out from
8250_port). So no point to pass it.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240805102046.307511-5-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_8250.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index fd59ed2cca53..e0717c8393d7 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -193,7 +193,7 @@ void serial8250_do_pm(struct uart_port *port, unsigned int state,
 		      unsigned int oldstate);
 void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl);
 void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
-			       unsigned int quot, unsigned int quot_frac);
+			       unsigned int quot);
 int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr);
-- 
cgit v1.2.3


From 130fd056dd82b02db9a661c013071af35309be1a Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Mon, 10 Jun 2024 20:20:16 +0100
Subject: sched/rt: Clean up usage of rt_task()

rt_task() checks if a task has RT priority. But depends on your
dictionary, this could mean it belongs to RT class, or is a 'realtime'
task, which includes RT and DL classes.

Since this has caused some confusion already on discussion [1], it
seemed a clean up is due.

I define the usage of rt_task() to be tasks that belong to RT class.
Make sure that it returns true only for RT class and audit the users and
replace the ones required the old behavior with the new realtime_task()
which returns true for RT and DL classes. Introduce similar
realtime_prio() to create similar distinction to rt_prio() and update
the users that required the old behavior to use the new function.

Move MAX_DL_PRIO to prio.h so it can be used in the new definitions.

Document the functions to make it more obvious what is the difference
between them. PI-boosted tasks is a factor that must be taken into
account when choosing which function to use.

Rename task_is_realtime() to realtime_task_policy() as the old name is
confusing against the new realtime_task().

No functional changes were intended.

[1] https://lore.kernel.org/lkml/20240506100509.GL40213@noisy.programming.kicks-ass.net/

Signed-off-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/20240610192018.1567075-2-qyousef@layalina.io
---
 include/linux/ioprio.h         |  2 +-
 include/linux/sched/deadline.h |  6 ++++--
 include/linux/sched/prio.h     |  1 +
 include/linux/sched/rt.h       | 27 ++++++++++++++++++++++++++-
 4 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index db1249cd9692..75859b78d540 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
 {
 	if (task->policy == SCHED_IDLE)
 		return IOPRIO_CLASS_IDLE;
-	else if (task_is_realtime(task))
+	else if (realtime_task_policy(task))
 		return IOPRIO_CLASS_RT;
 	else
 		return IOPRIO_CLASS_BE;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index df3aca89d4f5..5cb88b748ad6 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -10,8 +10,6 @@
 
 #include <linux/sched.h>
 
-#define MAX_DL_PRIO		0
-
 static inline int dl_prio(int prio)
 {
 	if (unlikely(prio < MAX_DL_PRIO))
@@ -19,6 +17,10 @@ static inline int dl_prio(int prio)
 	return 0;
 }
 
+/*
+ * Returns true if a task has a priority that belongs to DL class. PI-boosted
+ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
+ */
 static inline int dl_task(struct task_struct *p)
 {
 	return dl_prio(p->prio);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ab83d85e1183..6ab43b4f72f9 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -14,6 +14,7 @@
  */
 
 #define MAX_RT_PRIO		100
+#define MAX_DL_PRIO		0
 
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index b2b9e6eb9683..a055dd68a77c 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,18 +7,43 @@
 struct task_struct;
 
 static inline int rt_prio(int prio)
+{
+	if (unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO))
+		return 1;
+	return 0;
+}
+
+static inline int realtime_prio(int prio)
 {
 	if (unlikely(prio < MAX_RT_PRIO))
 		return 1;
 	return 0;
 }
 
+/*
+ * Returns true if a task has a priority that belongs to RT class. PI-boosted
+ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
+ */
 static inline int rt_task(struct task_struct *p)
 {
 	return rt_prio(p->prio);
 }
 
-static inline bool task_is_realtime(struct task_struct *tsk)
+/*
+ * Returns true if a task has a priority that belongs to RT or DL classes.
+ * PI-boosted tasks will return true. Use realtime_task_policy() to ignore
+ * PI-boosted tasks.
+ */
+static inline int realtime_task(struct task_struct *p)
+{
+	return realtime_prio(p->prio);
+}
+
+/*
+ * Returns true if a task has a policy that belongs to RT or DL classes.
+ * PI-boosted tasks will return false.
+ */
+static inline bool realtime_task_policy(struct task_struct *tsk)
 {
 	int policy = tsk->policy;
 
-- 
cgit v1.2.3


From b166af3db70fdcecf125662a2360471bb20be203 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Mon, 10 Jun 2024 20:20:17 +0100
Subject: sched/rt, dl: Convert functions to return bool

{rt, realtime, dl}_{task, prio}() functions' return value is actually
a bool. Convert their return type to reflect that.

Suggested-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Signed-off-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Reviewed-by: Metin Kaya <metin.kaya@arm.com>
Link: https://lore.kernel.org/r/20240610192018.1567075-3-qyousef@layalina.io
---
 include/linux/sched/deadline.h |  8 +++-----
 include/linux/sched/rt.h       | 16 ++++++----------
 2 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 5cb88b748ad6..3a912ab42bb5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -10,18 +10,16 @@
 
 #include <linux/sched.h>
 
-static inline int dl_prio(int prio)
+static inline bool dl_prio(int prio)
 {
-	if (unlikely(prio < MAX_DL_PRIO))
-		return 1;
-	return 0;
+	return unlikely(prio < MAX_DL_PRIO);
 }
 
 /*
  * Returns true if a task has a priority that belongs to DL class. PI-boosted
  * tasks will return true. Use dl_policy() to ignore PI-boosted tasks.
  */
-static inline int dl_task(struct task_struct *p)
+static inline bool dl_task(struct task_struct *p)
 {
 	return dl_prio(p->prio);
 }
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a055dd68a77c..91ef1ef2019f 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -6,25 +6,21 @@
 
 struct task_struct;
 
-static inline int rt_prio(int prio)
+static inline bool rt_prio(int prio)
 {
-	if (unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO))
-		return 1;
-	return 0;
+	return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
 }
 
-static inline int realtime_prio(int prio)
+static inline bool realtime_prio(int prio)
 {
-	if (unlikely(prio < MAX_RT_PRIO))
-		return 1;
-	return 0;
+	return unlikely(prio < MAX_RT_PRIO);
 }
 
 /*
  * Returns true if a task has a priority that belongs to RT class. PI-boosted
  * tasks will return true. Use rt_policy() to ignore PI-boosted tasks.
  */
-static inline int rt_task(struct task_struct *p)
+static inline bool rt_task(struct task_struct *p)
 {
 	return rt_prio(p->prio);
 }
@@ -34,7 +30,7 @@ static inline int rt_task(struct task_struct *p)
  * PI-boosted tasks will return true. Use realtime_task_policy() to ignore
  * PI-boosted tasks.
  */
-static inline int realtime_task(struct task_struct *p)
+static inline bool realtime_task(struct task_struct *p)
 {
 	return realtime_prio(p->prio);
 }
-- 
cgit v1.2.3


From ae04f69de0bef93c7086cf2983dbc8e8fd624ebe Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Mon, 10 Jun 2024 20:20:18 +0100
Subject: sched/rt: Rename realtime_{prio, task}() to rt_or_dl_{prio, task}()

Some find the name realtime overloaded. Use rt_or_dl() as an
alternative, hopefully better, name.

Suggested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240610192018.1567075-4-qyousef@layalina.io
---
 include/linux/ioprio.h   |  2 +-
 include/linux/sched/rt.h | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 75859b78d540..b25377b6ea98 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
 {
 	if (task->policy == SCHED_IDLE)
 		return IOPRIO_CLASS_IDLE;
-	else if (realtime_task_policy(task))
+	else if (rt_or_dl_task_policy(task))
 		return IOPRIO_CLASS_RT;
 	else
 		return IOPRIO_CLASS_BE;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 91ef1ef2019f..4e3338103654 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -11,7 +11,7 @@ static inline bool rt_prio(int prio)
 	return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO);
 }
 
-static inline bool realtime_prio(int prio)
+static inline bool rt_or_dl_prio(int prio)
 {
 	return unlikely(prio < MAX_RT_PRIO);
 }
@@ -27,19 +27,19 @@ static inline bool rt_task(struct task_struct *p)
 
 /*
  * Returns true if a task has a priority that belongs to RT or DL classes.
- * PI-boosted tasks will return true. Use realtime_task_policy() to ignore
+ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore
  * PI-boosted tasks.
  */
-static inline bool realtime_task(struct task_struct *p)
+static inline bool rt_or_dl_task(struct task_struct *p)
 {
-	return realtime_prio(p->prio);
+	return rt_or_dl_prio(p->prio);
 }
 
 /*
  * Returns true if a task has a policy that belongs to RT or DL classes.
  * PI-boosted tasks will return false.
  */
-static inline bool realtime_task_policy(struct task_struct *tsk)
+static inline bool rt_or_dl_task_policy(struct task_struct *tsk)
 {
 	int policy = tsk->policy;
 
-- 
cgit v1.2.3


From c772a2c690182410642ead740f7a84b3a7544b2b Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 1 Aug 2024 15:05:10 +0300
Subject: net/mlx5: Add IFC related stuff for data direct

Add IFC related stuff for data direct.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://patch.msgid.link/82da7f578a567909bb5858a64ba844fe4cc298fa.1722512548.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 51 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cab228cf51c6..970c9d8473ef 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -313,6 +313,7 @@ enum {
 	MLX5_CMD_OP_MODIFY_VHCA_STATE             = 0xb0e,
 	MLX5_CMD_OP_SYNC_CRYPTO                   = 0xb12,
 	MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS       = 0xb16,
+	MLX5_CMD_OPCODE_QUERY_VUID                = 0xb22,
 	MLX5_CMD_OP_MAX
 };
 
@@ -1885,7 +1886,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_5a0[0x10];
 	u8         enhanced_cqe_compression[0x1];
-	u8         reserved_at_5b1[0x2];
+	u8         reserved_at_5b1[0x1];
+	u8         crossing_vhca_mkey[0x1];
 	u8         log_max_dek[0x5];
 	u8         reserved_at_5b8[0x4];
 	u8         mini_cqe_resp_stride_index[0x1];
@@ -1954,7 +1956,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   dynamic_msix_table_size[0xc];
 	u8	   reserved_at_740[0xc];
 	u8	   min_dynamic_vf_msix_table_size[0x4];
-	u8	   reserved_at_750[0x4];
+	u8	   reserved_at_750[0x2];
+	u8	   data_direct[0x1];
+	u8	   reserved_at_753[0x1];
 	u8	   max_dynamic_vf_msix_table_size[0xc];
 
 	u8         reserved_at_760[0x3];
@@ -1982,7 +1986,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_0[0x80];
 
 	u8         migratable[0x1];
-	u8         reserved_at_81[0x1f];
+	u8         reserved_at_81[0x11];
+	u8         query_vuid[0x1];
+	u8         reserved_at_93[0xd];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -4154,6 +4160,7 @@ enum {
 	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
 	MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4,
 	MLX5_MKC_ACCESS_MODE_MEMIC = 0x5,
+	MLX5_MKC_ACCESS_MODE_CROSSING = 0x6,
 };
 
 struct mlx5_ifc_mkc_bits {
@@ -4196,7 +4203,10 @@ struct mlx5_ifc_mkc_bits {
 
 	u8         bsf_octword_size[0x20];
 
-	u8         reserved_at_120[0x80];
+	u8         reserved_at_120[0x60];
+
+	u8         crossing_target_vhca_id[0x10];
+	u8         reserved_at_190[0x10];
 
 	u8         translations_octword_size[0x20];
 
@@ -5124,6 +5134,36 @@ struct mlx5_ifc_query_vport_state_out_bits {
 	u8         state[0x4];
 };
 
+struct mlx5_ifc_array1024_auto_bits {
+	u8         array1024_auto[32][0x20];
+};
+
+struct mlx5_ifc_query_vuid_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x40];
+
+	u8         query_vfs_vuid[0x1];
+	u8         data_direct[0x1];
+	u8         reserved_at_62[0xe];
+	u8         vhca_id[0x10];
+};
+
+struct mlx5_ifc_query_vuid_out_bits {
+	u8        status[0x8];
+	u8        reserved_at_8[0x18];
+
+	u8        syndrome[0x20];
+
+	u8        reserved_at_40[0x1a0];
+
+	u8        reserved_at_1e0[0x10];
+	u8        num_of_entries[0x10];
+
+	struct mlx5_ifc_array1024_auto_bits vuid[];
+};
+
 enum {
 	MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT  = 0x0,
 	MLX5_VPORT_STATE_OP_MOD_ESW_VPORT   = 0x1,
@@ -8989,7 +9029,8 @@ struct mlx5_ifc_create_mkey_in_bits {
 
 	u8         pg_access[0x1];
 	u8         mkey_umem_valid[0x1];
-	u8         reserved_at_62[0x1e];
+	u8         data_direct[0x1];
+	u8         reserved_at_63[0x1d];
 
 	struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
 
-- 
cgit v1.2.3


From a09cdb8f564613769142a60400bb5160864c3269 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Thu, 8 Aug 2024 12:41:17 +0200
Subject: genirq: Remove unused irq_chip_generic:: {type,polarity}_cache

The type_cache and polarity_cache members of struct irq_chip_generic are
unused. Remove them both along with their kernel-doc.

Found by https://github.com/jirislaby/clang-struct.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240808104118.430670-2-jirislaby@kernel.org
---
 include/linux/irq.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1f5dbf1f92c9..00490d6ead65 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1040,8 +1040,6 @@ struct irq_chip_type {
  * @irq_base:		Interrupt base nr for this chip
  * @irq_cnt:		Number of interrupts handled by this chip
  * @mask_cache:		Cached mask register shared between all chip types
- * @type_cache:		Cached type register
- * @polarity_cache:	Cached polarity register
  * @wake_enabled:	Interrupt can wakeup from suspend
  * @wake_active:	Interrupt is marked as an wakeup from suspend source
  * @num_ct:		Number of available irq_chip_type instances (usually 1)
@@ -1068,8 +1066,6 @@ struct irq_chip_generic {
 	unsigned int		irq_base;
 	unsigned int		irq_cnt;
 	u32			mask_cache;
-	u32			type_cache;
-	u32			polarity_cache;
 	u32			wake_enabled;
 	u32			wake_active;
 	unsigned int		num_ct;
-- 
cgit v1.2.3


From 60029162a0458832ab2bcfc6fd4986bfd9ca0f55 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Thu, 8 Aug 2024 12:41:18 +0200
Subject: genirq: Remove irq_chip_regs:: Polarity

The polarity member of struct irq_chip_regs is unused. Remove it along
with its kernel-doc.

Found by https://github.com/jirislaby/clang-struct.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240808104118.430670-3-jirislaby@kernel.org
---
 include/linux/irq.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 00490d6ead65..fa711f80957b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -991,7 +991,6 @@ void irq_init_desc(unsigned int irq);
  * @ack:	Ack register offset to reg_base
  * @eoi:	Eoi register offset to reg_base
  * @type:	Type configuration register offset to reg_base
- * @polarity:	Polarity configuration register offset to reg_base
  */
 struct irq_chip_regs {
 	unsigned long		enable;
@@ -1000,7 +999,6 @@ struct irq_chip_regs {
 	unsigned long		ack;
 	unsigned long		eoi;
 	unsigned long		type;
-	unsigned long		polarity;
 };
 
 /**
-- 
cgit v1.2.3


From 09612576046a3cd29bd2b2b88c5813b1dfe96775 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 7 Aug 2024 14:22:26 +0200
Subject: net: sungem_phy: Constify struct mii_phy_def

'struct mii_phy_def' are not modified in this driver.

Constifying these structures moves some data to a read-only section, so
increase overall security.

While at it fix the checkpatch warning related to this patch (some missing
newlines and spaces around *)

On a x86_64, with allmodconfig:
Before:
======
  27709	    928	      0	  28637	   6fdd	drivers/net/sungem_phy.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  28157	    476	      0	  28633	   6fd9	drivers/net/sungem_phy.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/54c3b30930f80f4895e6fa2f4234714fdea4ef4e.1723033266.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/sungem_phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sungem_phy.h b/include/linux/sungem_phy.h
index c505f30e8b68..eecc7eb63bfb 100644
--- a/include/linux/sungem_phy.h
+++ b/include/linux/sungem_phy.h
@@ -40,7 +40,7 @@ enum {
 /* An instance of a PHY, partially borrowed from mii_if_info */
 struct mii_phy
 {
-	struct mii_phy_def*	def;
+	const struct mii_phy_def *def;
 	u32			advertising;
 	int			mii_id;
 
-- 
cgit v1.2.3


From 5fcf329676cf9d132842d9871f83a091c32f3bfc Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 19 Jul 2024 13:41:50 +0200
Subject: fs: add put_mnt_ns() cleanup helper

Add a simple helper to put a mount namespace reference.

Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-3-834113cab0d2@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/mnt_namespace.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 8f882f5881e8..70b366b64816 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -3,6 +3,9 @@
 #define _NAMESPACE_H_
 #ifdef __KERNEL__
 
+#include <linux/cleanup.h>
+#include <linux/err.h>
+
 struct mnt_namespace;
 struct fs_struct;
 struct user_namespace;
@@ -11,6 +14,7 @@ struct ns_common;
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct user_namespace *, struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);
+DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T))
 extern struct ns_common *from_mnt_ns(struct mnt_namespace *);
 
 extern const struct file_operations proc_mounts_operations;
-- 
cgit v1.2.3


From 257b1c2c78c25643526609dee0c15f1544eb3252 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 19 Jul 2024 13:41:51 +0200
Subject: file: add fput() cleanup helper

Add a simple helper to put a file reference.

Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-4-834113cab0d2@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/file.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index 237931f20739..d1e768b06069 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -11,6 +11,7 @@
 #include <linux/posix_types.h>
 #include <linux/errno.h>
 #include <linux/cleanup.h>
+#include <linux/err.h>
 
 struct file;
 
@@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd);
 
 DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
 	     get_unused_fd_flags(flags), unsigned flags)
+DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
 
 /*
  * take_fd() will take care to set @fd to -EBADF ensuring that
-- 
cgit v1.2.3


From 7ff7509fa52397455e04bd44982e9dfbbd19457f Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Mon, 29 Jul 2024 11:36:26 +0200
Subject: PCI: Make pcim_request_region() a public function

pcim_request_region() is the managed counterpart of pci_request_region().
It is currently only used internally for PCI.

It can be useful for a number of drivers and exporting it is a step towards
deprecating more complicated functions.

Make pcim_request_region() a public function.

Link: https://lore.kernel.org/r/20240729093625.17561-4-pstanner@redhat.com
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..4246cb790c7b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2292,6 +2292,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass,
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
 void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
 void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
+int pcim_request_region(struct pci_dev *pdev, int bar, const char *name);
 int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name);
 int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask,
 				   const char *name);
-- 
cgit v1.2.3


From d140f80f603584d8282817994c0c6241e736bef4 Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 7 Aug 2024 10:30:18 +0200
Subject: PCI: Deprecate pcim_iomap_regions() in favor of pcim_iomap_region()

pcim_iomap_regions() is a complicated function that uses a bit mask to
determine the BARs the user wishes to request and ioremap. Almost all users
only ever set a single bit in that mask, making that mechanism
questionable.

pcim_iomap_region() is now available as a more simple replacement.

Make pcim_iomap_region() a public function.

Mark pcim_iomap_regions() as deprecated.

Link: https://lore.kernel.org/r/20240807083018.8734-2-pstanner@redhat.com
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4246cb790c7b..aa9ef7890c50 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2290,6 +2290,8 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass,
 #endif
 
 void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
+void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar,
+				const char *name);
 void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
 void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
 int pcim_request_region(struct pci_dev *pdev, int bar, const char *name);
-- 
cgit v1.2.3


From 70114e7f7585ef078c2b7033ee14218f95f55e22 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Thu, 8 Aug 2024 15:34:02 +0300
Subject: irqdomain: Simplify simple and legacy domain creation

irq_domain_create_simple() and irq_domain_create_legacy() use
__irq_domain_instantiate(), but have extra handling of allocating interrupt
descriptors and associating interrupts in them. Some of that is duplicated.

There are also call sites which have conditonals to invoke different
interrupt domain creator functions, where one of them is usually
irq_domain_create_legacy(). Alternatively they associate the interrupts for
the legacy case after creating the domain.

Moving the extra logic of irq_domain_create_simple()/legacy() into
__irq_domain_instantiate() allows to consolidate that.

Introduce hwirq_base and virq_base members in the irq_domain_info
structure, which allows to transport the required information and add the
conditional interrupt descriptor allocation and interrupt association into
__irq_domain_instantiate().

This reduces irq_domain_create_legacy() and irq_domain_create_simple() to
trivial wrappers which fill in the info structure and allows call sites
which must support the legacy case along with more modern mechanism to
select the domain type via the parameters of the info struct.

[ tglx: Massaged change log ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/32d07bd79eb2b5416e24da9e9e8fe5955423dcf9.1723120028.git.mazziesaccount@gmail.com
---
 include/linux/irqdomain.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index de6105f68fec..bfcffa2c7047 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -291,6 +291,9 @@ struct irq_domain_chip_generic_info;
  * @hwirq_max:		Maximum number of interrupts supported by controller
  * @direct_max:		Maximum value of direct maps;
  *			Use ~0 for no limit; 0 for no direct mapping
+ * @hwirq_base:		The first hardware interrupt number (legacy domains only)
+ * @virq_base:		The first Linux interrupt number for legacy domains to
+ *			immediately associate the interrupts after domain creation
  * @bus_token:		Domain bus token
  * @ops:		Domain operation callbacks
  * @host_data:		Controller private data pointer
@@ -307,6 +310,8 @@ struct irq_domain_info {
 	unsigned int				size;
 	irq_hw_number_t				hwirq_max;
 	int					direct_max;
+	unsigned int				hwirq_base;
+	unsigned int				virq_base;
 	enum irq_domain_bus_token		bus_token;
 	const struct irq_domain_ops		*ops;
 	void					*host_data;
-- 
cgit v1.2.3


From 1e7c05292531e5b6bebe409cd531ed4ec0b2ff56 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Thu, 8 Aug 2024 22:23:06 +0200
Subject: irqdomain: Allow giving name suffix for domain

Devices can provide multiple interrupt lines. One reason for this is that
a device has multiple subfunctions, each providing its own interrupt line.
Another reason is that a device can be designed to be used (also) on a
system where some of the interrupts can be routed to another processor.

A line often further acts as a demultiplex for specific interrupts
and has it's respective set of interrupt (status, mask, ack, ...)
registers.

Regmap supports the handling of these registers and demultiplexing
interrupts, but the interrupt domain code ends up assigning the same name
for the per interrupt line domains. This causes a naming collision in the
debugFS code and leads to confusion, as /proc/interrupts shows two separate
interrupts with the same domain name and hardware interrupt number.

Instead of adding a workaround in regmap or driver code, allow giving a
name suffix for the domain name when the domain is created.

Add a name_suffix field in the irq_domain_info structure and make
irq_domain_instantiate() use this suffix if it is given when a domain is
created.

[ tglx: Adopt it to the cleanup patch and fixup the invalid NULL return ]

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/871q2yvk5x.ffs@tglx
---
 include/linux/irqdomain.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index bfcffa2c7047..e432b6a12a32 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -295,6 +295,8 @@ struct irq_domain_chip_generic_info;
  * @virq_base:		The first Linux interrupt number for legacy domains to
  *			immediately associate the interrupts after domain creation
  * @bus_token:		Domain bus token
+ * @name_suffix:	Optional name suffix to avoid collisions when multiple
+ *			domains are added using same fwnode
  * @ops:		Domain operation callbacks
  * @host_data:		Controller private data pointer
  * @dgc_info:		Geneneric chip information structure pointer used to
@@ -313,6 +315,7 @@ struct irq_domain_info {
 	unsigned int				hwirq_base;
 	unsigned int				virq_base;
 	enum irq_domain_bus_token		bus_token;
+	const char				*name_suffix;
 	const struct irq_domain_ops		*ops;
 	void					*host_data;
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
-- 
cgit v1.2.3


From da4fe6815aca25603944a64b0965310512e867d0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 2 Aug 2024 14:09:00 +0800
Subject: Revert "lib/mpi: Introduce ec implementation to MPI library"

This reverts commit d58bb7e55a8a65894cc02f27c3e2bf9403e7c40f.

It's no longer needed since sm2 has been removed.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/mpi.h | 105 ----------------------------------------------------
 1 file changed, 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index eb0d1c1db208..89b720893e12 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -157,111 +157,6 @@ void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor);
 /*-- mpi-inv.c --*/
 int mpi_invm(MPI x, MPI a, MPI n);
 
-/*-- ec.c --*/
-
-/* Object to represent a point in projective coordinates */
-struct gcry_mpi_point {
-	MPI x;
-	MPI y;
-	MPI z;
-};
-
-typedef struct gcry_mpi_point *MPI_POINT;
-
-/* Models describing an elliptic curve */
-enum gcry_mpi_ec_models {
-	/* The Short Weierstrass equation is
-	 *      y^2 = x^3 + ax + b
-	 */
-	MPI_EC_WEIERSTRASS = 0,
-	/* The Montgomery equation is
-	 *      by^2 = x^3 + ax^2 + x
-	 */
-	MPI_EC_MONTGOMERY,
-	/* The Twisted Edwards equation is
-	 *      ax^2 + y^2 = 1 + bx^2y^2
-	 * Note that we use 'b' instead of the commonly used 'd'.
-	 */
-	MPI_EC_EDWARDS
-};
-
-/* Dialects used with elliptic curves */
-enum ecc_dialects {
-	ECC_DIALECT_STANDARD = 0,
-	ECC_DIALECT_ED25519,
-	ECC_DIALECT_SAFECURVE
-};
-
-/* This context is used with all our EC functions. */
-struct mpi_ec_ctx {
-	enum gcry_mpi_ec_models model; /* The model describing this curve. */
-	enum ecc_dialects dialect;     /* The ECC dialect used with the curve. */
-	int flags;                     /* Public key flags (not always used). */
-	unsigned int nbits;            /* Number of bits.  */
-
-	/* Domain parameters.  Note that they may not all be set and if set
-	 * the MPIs may be flagged as constant.
-	 */
-	MPI p;         /* Prime specifying the field GF(p).  */
-	MPI a;         /* First coefficient of the Weierstrass equation.  */
-	MPI b;         /* Second coefficient of the Weierstrass equation.  */
-	MPI_POINT G;   /* Base point (generator).  */
-	MPI n;         /* Order of G.  */
-	unsigned int h;       /* Cofactor.  */
-
-	/* The actual key.  May not be set.  */
-	MPI_POINT Q;   /* Public key.   */
-	MPI d;         /* Private key.  */
-
-	const char *name;      /* Name of the curve.  */
-
-	/* This structure is private to mpi/ec.c! */
-	struct {
-		struct {
-			unsigned int a_is_pminus3:1;
-			unsigned int two_inv_p:1;
-		} valid; /* Flags to help setting the helper vars below.  */
-
-		int a_is_pminus3;  /* True if A = P - 3. */
-
-		MPI two_inv_p;
-
-		mpi_barrett_t p_barrett;
-
-		/* Scratch variables.  */
-		MPI scratch[11];
-
-		/* Helper for fast reduction.  */
-		/*   int nist_nbits; /\* If this is a NIST curve, the # of bits. *\/ */
-		/*   MPI s[10]; */
-		/*   MPI c; */
-	} t;
-
-	/* Curve specific computation routines for the field.  */
-	void (*addm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx);
-	void (*subm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ec);
-	void (*mulm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx);
-	void (*pow2)(MPI w, const MPI b, struct mpi_ec_ctx *ctx);
-	void (*mul2)(MPI w, MPI u, struct mpi_ec_ctx *ctx);
-};
-
-void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model,
-			enum ecc_dialects dialect,
-			int flags, MPI p, MPI a, MPI b);
-void mpi_ec_deinit(struct mpi_ec_ctx *ctx);
-MPI_POINT mpi_point_new(unsigned int nbits);
-void mpi_point_release(MPI_POINT p);
-void mpi_point_init(MPI_POINT p);
-void mpi_point_free_parts(MPI_POINT p);
-int mpi_ec_get_affine(MPI x, MPI y, MPI_POINT point, struct mpi_ec_ctx *ctx);
-void mpi_ec_add_points(MPI_POINT result,
-			MPI_POINT p1, MPI_POINT p2,
-			struct mpi_ec_ctx *ctx);
-void mpi_ec_mul_point(MPI_POINT result,
-			MPI scalar, MPI_POINT point,
-			struct mpi_ec_ctx *ctx);
-int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx);
-
 /* inline functions */
 
 /**
-- 
cgit v1.2.3


From 0737158aabc278526c784f63b8dd8963cdc558a9 Mon Sep 17 00:00:00 2001
From: Olivier Moysan <olivier.moysan@foss.st.com>
Date: Tue, 30 Jul 2024 10:46:31 +0200
Subject: iio: add read scale and offset services to iio backend framework

Add iio_backend_read_scale() and iio_backend_read_offset() services
to read channel scale and offset from an IIO backbend device.

Also add a read_raw callback which replicates the read_raw callback of
the IIO framework, and is intended to request miscellaneous channel
attributes from the backend device.
Both scale and offset helpers use this callback.

Signed-off-by: Olivier Moysan <olivier.moysan@foss.st.com>
Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240730084640.1307938-2-olivier.moysan@foss.st.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 2b9d1aa86552..838092b0de3d 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -3,6 +3,7 @@
 #define _IIO_BACKEND_H_
 
 #include <linux/types.h>
+#include <linux/iio/iio.h>
 
 struct iio_chan_spec;
 struct fwnode_handle;
@@ -85,6 +86,7 @@ enum iio_backend_sample_trigger {
  * @extend_chan_spec: Extend an IIO channel.
  * @ext_info_set: Extended info setter.
  * @ext_info_get: Extended info getter.
+ * @read_raw: Read a channel attribute from a backend device
  * @debugfs_print_chan_status: Print channel status into a buffer.
  * @debugfs_reg_access: Read or write register value of backend.
  **/
@@ -119,6 +121,9 @@ struct iio_backend_ops {
 			    const char *buf, size_t len);
 	int (*ext_info_get)(struct iio_backend *back, uintptr_t private,
 			    const struct iio_chan_spec *chan, char *buf);
+	int (*read_raw)(struct iio_backend *back,
+			struct iio_chan_spec const *chan, int *val, int *val2,
+			long mask);
 	int (*debugfs_print_chan_status)(struct iio_backend *back,
 					 unsigned int chan, char *buf,
 					 size_t len);
@@ -162,7 +167,9 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private,
 				 const char *buf, size_t len);
 ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private,
 				 const struct iio_chan_spec *chan, char *buf);
-
+int iio_backend_read_raw(struct iio_backend *back,
+			 struct iio_chan_spec const *chan, int *val, int *val2,
+			 long mask);
 int iio_backend_extend_chan_spec(struct iio_backend *back,
 				 struct iio_chan_spec *chan);
 void *iio_backend_get_priv(const struct iio_backend *conv);
@@ -174,6 +181,21 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev,
 int devm_iio_backend_register(struct device *dev,
 			      const struct iio_backend_info *info, void *priv);
 
+static inline int iio_backend_read_scale(struct iio_backend *back,
+					 struct iio_chan_spec const *chan,
+					 int *val, int *val2)
+{
+	return iio_backend_read_raw(back, chan, val, val2, IIO_CHAN_INFO_SCALE);
+}
+
+static inline int iio_backend_read_offset(struct iio_backend *back,
+					  struct iio_chan_spec const *chan,
+					  int *val, int *val2)
+{
+	return iio_backend_read_raw(back, chan, val, val2,
+				    IIO_CHAN_INFO_OFFSET);
+}
+
 ssize_t iio_backend_debugfs_print_chan_status(struct iio_backend *back,
 					      unsigned int chan, char *buf,
 					      size_t len);
-- 
cgit v1.2.3


From 2530d7d44ca6dc0c1c059f143cdcb2be8600c59a Mon Sep 17 00:00:00 2001
From: Olivier Moysan <olivier.moysan@foss.st.com>
Date: Tue, 30 Jul 2024 10:46:32 +0200
Subject: iio: add enable and disable services to iio backend framework

Add iio_backend_disable() and iio_backend_enable() APIs to allow
IIO backend consumer to request backend disabling and enabling.

Signed-off-by: Olivier Moysan <olivier.moysan@foss.st.com>
Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240730084640.1307938-3-olivier.moysan@foss.st.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 838092b0de3d..b0b663163ff4 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -144,6 +144,8 @@ struct iio_backend_info {
 int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan);
 int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan);
 int devm_iio_backend_enable(struct device *dev, struct iio_backend *back);
+int iio_backend_enable(struct iio_backend *back);
+void iio_backend_disable(struct iio_backend *back);
 int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan,
 				const struct iio_backend_data_fmt *data);
 int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan,
-- 
cgit v1.2.3


From c464cc610f51f6eb5f27bf905d4c1ab1896b2725 Mon Sep 17 00:00:00 2001
From: Olivier Moysan <olivier.moysan@foss.st.com>
Date: Tue, 30 Jul 2024 10:46:33 +0200
Subject: iio: add child nodes support in iio backend framework

Add an API to support IIO generic channels binding:
http://devicetree.org/schemas/iio/adc/adc.yaml#
This new API is needed, as generic channel DT node isn't populated as a
device.
Add devm_iio_backend_fwnode_get() to allow an IIO device backend
consumer to reference backend phandles in its child nodes.

Signed-off-by: Olivier Moysan <olivier.moysan@foss.st.com>
Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Link: https://patch.msgid.link/20240730084640.1307938-4-olivier.moysan@foss.st.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/backend.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index b0b663163ff4..37d56914d485 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -176,6 +176,9 @@ int iio_backend_extend_chan_spec(struct iio_backend *back,
 				 struct iio_chan_spec *chan);
 void *iio_backend_get_priv(const struct iio_backend *conv);
 struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name);
+struct iio_backend *devm_iio_backend_fwnode_get(struct device *dev,
+						const char *name,
+						struct fwnode_handle *fwnode);
 struct iio_backend *
 __devm_iio_backend_get_from_fwnode_lookup(struct device *dev,
 					  struct fwnode_handle *fwnode);
-- 
cgit v1.2.3


From 2837efdc7cef5b86b0c4d94a7410bc03cc2f003f Mon Sep 17 00:00:00 2001
From: Denis Benato <benato.denis96@gmail.com>
Date: Wed, 7 Aug 2024 20:56:18 +0200
Subject: iio: trigger: allow devices to suspend/resume theirs associated
 trigger

When a machine enters a sleep state while a trigger is associated to
an iio device that trigger is not resumed after exiting the sleep state:
provide iio device drivers a way to suspend and resume
the associated trigger to solve the aforementioned bug.

Each iio driver supporting external triggers is expected to call
iio_device_suspend_triggering before suspending,
and iio_device_resume_triggering upon resuming.

Signed-off-by: Denis Benato <benato.denis96@gmail.com>
Link: https://patch.msgid.link/20240807185619.7261-2-benato.denis96@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/iio.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 3a735a9a9eae..18779b631e90 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -810,6 +810,23 @@ static inline struct dentry *iio_get_debugfs_dentry(struct iio_dev *indio_dev)
 }
 #endif
 
+/**
+ * iio_device_suspend_triggering() - suspend trigger attached to an iio_dev
+ * @indio_dev: iio_dev associated with the device that will have triggers suspended
+ *
+ * Return 0 if successful, negative otherwise
+ **/
+int iio_device_suspend_triggering(struct iio_dev *indio_dev);
+
+/**
+ * iio_device_resume_triggering() - resume trigger attached to an iio_dev
+ *	that was previously suspended with iio_device_suspend_triggering()
+ * @indio_dev: iio_dev associated with the device that will have triggers resumed
+ *
+ * Return 0 if successful, negative otherwise
+ **/
+int iio_device_resume_triggering(struct iio_dev *indio_dev);
+
 #ifdef CONFIG_ACPI
 bool iio_read_acpi_mount_matrix(struct device *dev,
 				struct iio_mount_matrix *orientation,
-- 
cgit v1.2.3


From bf66471987b4bd537bc800353ca7d39bfd1d1022 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 10:51:10 +0200
Subject: context_tracking, rcu: Rename struct context_tracking
 .dynticks_nesting into .nesting

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

[ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ]

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index ad6570ffeff3..65290e7677e6 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -39,7 +39,7 @@ struct context_tracking {
 	atomic_t state;
 #endif
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
-	long dynticks_nesting;		/* Track process nesting level. */
+	long nesting;		/* Track process nesting level. */
 	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
 #endif
 };
@@ -77,14 +77,14 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu)
 
 static __always_inline long ct_dynticks_nesting(void)
 {
-	return __this_cpu_read(context_tracking.dynticks_nesting);
+	return __this_cpu_read(context_tracking.nesting);
 }
 
 static __always_inline long ct_dynticks_nesting_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return ct->dynticks_nesting;
+	return ct->nesting;
 }
 
 static __always_inline long ct_dynticks_nmi_nesting(void)
-- 
cgit v1.2.3


From 1089c0078b69bce0c2d540e3cc43470ba9e5dcfd Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 14:45:30 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_nesting() into ct_nesting()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 65290e7677e6..586c1ff22c2e 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -75,7 +75,7 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu)
 	return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK;
 }
 
-static __always_inline long ct_dynticks_nesting(void)
+static __always_inline long ct_nesting(void)
 {
 	return __this_cpu_read(context_tracking.nesting);
 }
-- 
cgit v1.2.3


From bca9455da531a3c2520c28ff349d0dab4e471d28 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 14:48:26 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_nesting_cpu() into
 ct_nesting_cpu()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any
meaning.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 586c1ff22c2e..fd42d8120ac2 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -80,7 +80,7 @@ static __always_inline long ct_nesting(void)
 	return __this_cpu_read(context_tracking.nesting);
 }
 
-static __always_inline long ct_dynticks_nesting_cpu(int cpu)
+static __always_inline long ct_nesting_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-- 
cgit v1.2.3


From dc5fface4b30508933b515a4985401c8c8f6bcfd Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 10:52:03 +0200
Subject: context_tracking, rcu: Rename struct context_tracking
 .dynticks_nmi_nesting into .nmi_nesting

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any
meaning.

[ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ]

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index fd42d8120ac2..12d00adf29e1 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -40,7 +40,7 @@ struct context_tracking {
 #endif
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 	long nesting;		/* Track process nesting level. */
-	long dynticks_nmi_nesting;	/* Track irq/NMI nesting level. */
+	long nmi_nesting;	/* Track irq/NMI nesting level. */
 #endif
 };
 
@@ -89,14 +89,14 @@ static __always_inline long ct_nesting_cpu(int cpu)
 
 static __always_inline long ct_dynticks_nmi_nesting(void)
 {
-	return __this_cpu_read(context_tracking.dynticks_nmi_nesting);
+	return __this_cpu_read(context_tracking.nmi_nesting);
 }
 
 static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-	return ct->dynticks_nmi_nesting;
+	return ct->nmi_nesting;
 }
 #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
 
-- 
cgit v1.2.3


From 8375cb260d7e43610934af63748d1a51201449f8 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 14:46:14 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_nmi_nesting() into
 ct_nmi_nesting()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any
meaning.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 12d00adf29e1..8f32fe599c5c 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -87,7 +87,7 @@ static __always_inline long ct_nesting_cpu(int cpu)
 	return ct->nesting;
 }
 
-static __always_inline long ct_dynticks_nmi_nesting(void)
+static __always_inline long ct_nmi_nesting(void)
 {
 	return __this_cpu_read(context_tracking.nmi_nesting);
 }
-- 
cgit v1.2.3


From 2ef2890b7a94fbbfa8fdf0a90499ae1df476b8e8 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 14:49:13 +0200
Subject: context_tracking, rcu: Rename ct_dynticks_nmi_nesting_cpu() into
 ct_nmi_nesting_cpu()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any
meaning.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 8f32fe599c5c..34fd504e53a8 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -92,7 +92,7 @@ static __always_inline long ct_nmi_nesting(void)
 	return __this_cpu_read(context_tracking.nmi_nesting);
 }
 
-static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu)
+static __always_inline long ct_nmi_nesting_cpu(int cpu)
 {
 	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
 
-- 
cgit v1.2.3


From e1de438336222dbe27f2d4baa8c01074fcac2bef Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Mon, 29 Apr 2024 15:52:32 +0200
Subject: context_tracking, rcu: Rename DYNTICK_IRQ_NONIDLE into
 CT_NESTING_IRQ_NONIDLE

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any
meaning.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 34fd504e53a8..0dbda59c9f37 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -7,7 +7,7 @@
 #include <linux/context_tracking_irq.h>
 
 /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
+#define CT_NESTING_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
 
 enum ctx_state {
 	CT_STATE_DISABLED	= -1,	/* returned by ct_state() if unknown */
-- 
cgit v1.2.3


From aa9fbc5dd9da9e095a8b1a58540cf20cb06f595f Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 8 Aug 2024 12:41:17 +0100
Subject: net: mii: constify advertising mask

Constify the advertising mask to linkmode functions that only read from
the advertising mask.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index d5a959ce4877..b8f26d4513c3 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -140,7 +140,7 @@ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv)
  * settings to phy autonegotiation advertisements for the
  * MII_ADVERTISE register.
  */
-static inline u32 linkmode_adv_to_mii_adv_t(unsigned long *advertising)
+static inline u32 linkmode_adv_to_mii_adv_t(const unsigned long *advertising)
 {
 	u32 result = 0;
 
@@ -215,7 +215,8 @@ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv)
  * settings to phy autonegotiation advertisements for the
  * MII_CTRL1000 register when in 1000T mode.
  */
-static inline u32 linkmode_adv_to_mii_ctrl1000_t(unsigned long *advertising)
+static inline u32
+linkmode_adv_to_mii_ctrl1000_t(const unsigned long *advertising)
 {
 	u32 result = 0;
 
@@ -453,7 +454,7 @@ static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising,
  * A small helper function that translates linkmode advertising to LVL
  * pause capabilities.
  */
-static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising)
+static inline u32 linkmode_adv_to_lcl_adv_t(const unsigned long *advertising)
 {
 	u32 lcl_adv = 0;
 
-- 
cgit v1.2.3


From d0f8a8973f265f6a276f99d091af99edfb2b87de Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Thu, 8 Aug 2024 00:14:13 +0000
Subject: mm/memblock: introduce a new helper
 memblock_estimated_nr_free_pages()

During bootup, system may need the number of free pages in the whole system
to do some calculation before all pages are freed to buddy system. Usually
this number is get from totalram_pages(). Since we plan to move the free
pages accounting in __free_pages_core(), this value may not represent
total free pages at the early stage, especially when
CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.

Instead of using raw memblock api, let's introduce a new helper for user
to get the estimated number of free pages from memblock point of view.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
CC: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20240808001415.6298-1-richard.weiyang@gmail.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/memblock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index fc4d75c6cec3..673d5cae7c81 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -467,6 +467,7 @@ static inline __init_memblock bool memblock_bottom_up(void)
 
 phys_addr_t memblock_phys_mem_size(void);
 phys_addr_t memblock_reserved_size(void);
+unsigned long memblock_estimated_nr_free_pages(void);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
-- 
cgit v1.2.3


From 8c38617722bdf57a90e6c77ed9ee5ebb60958d2a Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 9 Aug 2024 16:15:54 +0200
Subject: memory: ti-aemif: remove platform data support

There are no longer any users of the ti-aemif driver that set up platform
data from board files. We can shrink the driver by removing support for
it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://lore.kernel.org/r/20240809-ti-aemif-v1-1-27b1e5001390@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 include/linux/platform_data/ti-aemif.h | 45 ----------------------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 include/linux/platform_data/ti-aemif.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ti-aemif.h b/include/linux/platform_data/ti-aemif.h
deleted file mode 100644
index 77625251df07..000000000000
--- a/include/linux/platform_data/ti-aemif.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * TI DaVinci AEMIF platform glue.
- *
- * Copyright (C) 2017 BayLibre SAS
- *
- * Author:
- *   Bartosz Golaszewski <bgolaszewski@baylibre.com>
- */
-
-#ifndef __TI_DAVINCI_AEMIF_DATA_H__
-#define __TI_DAVINCI_AEMIF_DATA_H__
-
-#include <linux/of_platform.h>
-
-/**
- * struct aemif_abus_data - Async bus configuration parameters.
- *
- * @cs - Chip-select number.
- */
-struct aemif_abus_data {
-	u32 cs;
-};
-
-/**
- * struct aemif_platform_data - Data to set up the TI aemif driver.
- *
- * @dev_lookup: of_dev_auxdata passed to of_platform_populate() for aemif
- *              subdevices.
- * @cs_offset: Lowest allowed chip-select number.
- * @abus_data: Array of async bus configuration entries.
- * @num_abus_data: Number of abus entries.
- * @sub_devices: Array of platform subdevices.
- * @num_sub_devices: Number of subdevices.
- */
-struct aemif_platform_data {
-	struct of_dev_auxdata *dev_lookup;
-	u32 cs_offset;
-	struct aemif_abus_data *abus_data;
-	size_t num_abus_data;
-	struct platform_device *sub_devices;
-	size_t num_sub_devices;
-};
-
-#endif /* __TI_DAVINCI_AEMIF_DATA_H__ */
-- 
cgit v1.2.3


From ce056504e2e53b22c1f789cff2ad6a0a135dc24d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 9 Aug 2024 22:37:20 -0700
Subject: ethtool: make ethtool_ops::cap_rss_ctx_supported optional

cap_rss_ctx_supported was created because the API for creating
and configuring additional contexts is mux'ed with the normal
RSS API. Presence of ops does not imply driver can actually
support rss_context != 0 (in fact drivers mostly ignore that
field). cap_rss_ctx_supported lets core check that the driver
is context-aware before calling it.

Now that we have .create_rxfh_context, there is no such
ambiguity. We can depend on presence of the op.
Make setting the bit optional.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 989c94eddb2b..a149485904d8 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -727,7 +727,8 @@ struct kernel_ethtool_ts_info {
  * @cap_link_lanes_supported: indicates if the driver supports lanes
  *	parameter.
  * @cap_rss_ctx_supported: indicates if the driver supports RSS
- *	contexts.
+ *	contexts via legacy API, drivers implementing @create_rxfh_context
+ *	do not have to set this bit.
  * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor
  *	RSS.
  * @rxfh_indir_space: max size of RSS indirection tables, if indirection table
-- 
cgit v1.2.3


From ec6e57beaf8bc64ea0c2dc0cc360afcc7f504425 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 9 Aug 2024 22:37:22 -0700
Subject: ethtool: rss: don't report key if device doesn't support it

marvell/otx2 and mvpp2 do not support setting different
keys for different RSS contexts. Contexts have separate
indirection tables but key is shared with all other contexts.
This is likely fine, indirection table is the most important
piece.

Don't report the key-related parameters from such drivers.
This prevents driver-errors, e.g. otx2 always writes
the main key, even when user asks to change per-context key.
The second reason is that without this change tracking
the keys by the core gets complicated. Even if the driver
correctly reject setting key with rss_context != 0,
change of the main key would have to be reflected in
the XArray for all additional contexts.

Since the additional contexts don't have their own keys
not including the attributes (in Netlink speak) seems
intuitive. ethtool CLI seems to deal with it just fine.

Having to set the flag in majority of the drivers is
a bit tedious but not reporting the key is a safer
default.

Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a149485904d8..12f6dc567598 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -731,6 +731,9 @@ struct kernel_ethtool_ts_info {
  *	do not have to set this bit.
  * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor
  *	RSS.
+ * @rxfh_per_ctx_key: device supports setting different RSS key for each
+ *	additional context. Netlink API should report hfunc, key, and input_xfrm
+ *	for every context, not just context 0.
  * @rxfh_indir_space: max size of RSS indirection tables, if indirection table
  *	size as returned by @get_rxfh_indir_size may change during lifetime
  *	of the device. Leave as 0 if the table size is constant.
@@ -952,6 +955,7 @@ struct ethtool_ops {
 	u32     cap_link_lanes_supported:1;
 	u32     cap_rss_ctx_supported:1;
 	u32	cap_rss_sym_xor_supported:1;
+	u32	rxfh_per_ctx_key:1;
 	u32	rxfh_indir_space;
 	u16	rxfh_key_space;
 	u16	rxfh_priv_size;
-- 
cgit v1.2.3


From fc9aef4382c02774662da3d7e1de8ba224e04f80 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Thu, 25 Jul 2024 08:23:40 -0400
Subject: platform/x86/intel/vsec.h: Move to include/linux
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some drivers outside of PDX86 need access to the vsec header. Move it to
include/linux to make it easier to include.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20240725122346.4063913-2-michael.j.ruhl@intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/intel_vsec.h | 134 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 include/linux/intel_vsec.h

(limited to 'include/linux')

diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
new file mode 100644
index 000000000000..6495e37c9079
--- /dev/null
+++ b/include/linux/intel_vsec.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_VSEC_H
+#define _INTEL_VSEC_H
+
+#include <linux/auxiliary_bus.h>
+#include <linux/bits.h>
+
+#define VSEC_CAP_TELEMETRY	BIT(0)
+#define VSEC_CAP_WATCHER	BIT(1)
+#define VSEC_CAP_CRASHLOG	BIT(2)
+#define VSEC_CAP_SDSI		BIT(3)
+#define VSEC_CAP_TPMI		BIT(4)
+
+/* Intel DVSEC offsets */
+#define INTEL_DVSEC_ENTRIES		0xA
+#define INTEL_DVSEC_SIZE		0xB
+#define INTEL_DVSEC_TABLE		0xC
+#define INTEL_DVSEC_TABLE_BAR(x)	((x) & GENMASK(2, 0))
+#define INTEL_DVSEC_TABLE_OFFSET(x)	((x) & GENMASK(31, 3))
+#define TABLE_OFFSET_SHIFT		3
+
+struct pci_dev;
+struct resource;
+
+enum intel_vsec_id {
+	VSEC_ID_TELEMETRY	= 2,
+	VSEC_ID_WATCHER		= 3,
+	VSEC_ID_CRASHLOG	= 4,
+	VSEC_ID_SDSI		= 65,
+	VSEC_ID_TPMI		= 66,
+};
+
+/**
+ * struct intel_vsec_header - Common fields of Intel VSEC and DVSEC registers.
+ * @rev:         Revision ID of the VSEC/DVSEC register space
+ * @length:      Length of the VSEC/DVSEC register space
+ * @id:          ID of the feature
+ * @num_entries: Number of instances of the feature
+ * @entry_size:  Size of the discovery table for each feature
+ * @tbir:        BAR containing the discovery tables
+ * @offset:      BAR offset of start of the first discovery table
+ */
+struct intel_vsec_header {
+	u8	rev;
+	u16	length;
+	u16	id;
+	u8	num_entries;
+	u8	entry_size;
+	u8	tbir;
+	u32	offset;
+};
+
+enum intel_vsec_quirks {
+	/* Watcher feature not supported */
+	VSEC_QUIRK_NO_WATCHER	= BIT(0),
+
+	/* Crashlog feature not supported */
+	VSEC_QUIRK_NO_CRASHLOG	= BIT(1),
+
+	/* Use shift instead of mask to read discovery table offset */
+	VSEC_QUIRK_TABLE_SHIFT	= BIT(2),
+
+	/* DVSEC not present (provided in driver data) */
+	VSEC_QUIRK_NO_DVSEC	= BIT(3),
+
+	/* Platforms requiring quirk in the auxiliary driver */
+	VSEC_QUIRK_EARLY_HW     = BIT(4),
+};
+
+/**
+ * struct intel_vsec_platform_info - Platform specific data
+ * @parent:    parent device in the auxbus chain
+ * @headers:   list of headers to define the PMT client devices to create
+ * @caps:      bitmask of PMT capabilities for the given headers
+ * @quirks:    bitmask of VSEC device quirks
+ * @base_addr: allow a base address to be specified (rather than derived)
+ */
+struct intel_vsec_platform_info {
+	struct device *parent;
+	struct intel_vsec_header **headers;
+	unsigned long caps;
+	unsigned long quirks;
+	u64 base_addr;
+};
+
+/**
+ * struct intel_sec_device - Auxbus specific device information
+ * @auxdev:        auxbus device struct for auxbus access
+ * @pcidev:        pci device associated with the device
+ * @resource:      any resources shared by the parent
+ * @ida:           id reference
+ * @num_resources: number of resources
+ * @id:            xarray id
+ * @priv_data:     any private data needed
+ * @quirks:        specified quirks
+ * @base_addr:     base address of entries (if specified)
+ */
+struct intel_vsec_device {
+	struct auxiliary_device auxdev;
+	struct pci_dev *pcidev;
+	struct resource *resource;
+	struct ida *ida;
+	int num_resources;
+	int id; /* xa */
+	void *priv_data;
+	size_t priv_data_size;
+	unsigned long quirks;
+	u64 base_addr;
+};
+
+int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
+		       struct intel_vsec_device *intel_vsec_dev,
+		       const char *name);
+
+static inline struct intel_vsec_device *dev_to_ivdev(struct device *dev)
+{
+	return container_of(dev, struct intel_vsec_device, auxdev.dev);
+}
+
+static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device *auxdev)
+{
+	return container_of(auxdev, struct intel_vsec_device, auxdev);
+}
+
+#if IS_ENABLED(CONFIG_INTEL_VSEC)
+void intel_vsec_register(struct pci_dev *pdev,
+			 struct intel_vsec_platform_info *info);
+#else
+static inline void intel_vsec_register(struct pci_dev *pdev,
+				       struct intel_vsec_platform_info *info)
+{
+}
+#endif
+#endif
-- 
cgit v1.2.3


From e92affc74cd8624a548b380af7364be037adef35 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Thu, 25 Jul 2024 08:23:41 -0400
Subject: platform/x86/intel/vsec: Add PMT read callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some PMT providers require device specific actions before their telemetry
can be read. Provide assignable PMT read callbacks to allow providers to
perform those actions.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20240725122346.4063913-3-michael.j.ruhl@intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/intel_vsec.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 6495e37c9079..11ee185566c3 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -67,10 +67,24 @@ enum intel_vsec_quirks {
 	VSEC_QUIRK_EARLY_HW     = BIT(4),
 };
 
+/**
+ * struct pmt_callbacks - Callback infrastructure for PMT devices
+ * ->read_telem() when specified, called by client driver to access PMT data (instead
+ * of direct copy).
+ * @pdev:  PCI device reference for the callback's use
+ * @guid:  ID of data to acccss
+ * @data:  buffer for the data to be copied
+ * @count: size of buffer
+ */
+struct pmt_callbacks {
+	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, u32 count);
+};
+
 /**
  * struct intel_vsec_platform_info - Platform specific data
  * @parent:    parent device in the auxbus chain
  * @headers:   list of headers to define the PMT client devices to create
+ * @priv_data: private data, usable by parent devices, currently a callback
  * @caps:      bitmask of PMT capabilities for the given headers
  * @quirks:    bitmask of VSEC device quirks
  * @base_addr: allow a base address to be specified (rather than derived)
@@ -78,6 +92,7 @@ enum intel_vsec_quirks {
 struct intel_vsec_platform_info {
 	struct device *parent;
 	struct intel_vsec_header **headers;
+	void *priv_data;
 	unsigned long caps;
 	unsigned long quirks;
 	u64 base_addr;
-- 
cgit v1.2.3


From 29bc83e4d90546aa794a9584786086b141a6ba4d Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Mon, 15 Jul 2024 16:23:24 -0700
Subject: srcu: faster gp seq wrap-around

Using a higher value for the initial gp sequence counters allows for
wrapping to occur faster. It can help with surfacing any issues that may
be happening as a result of the wrap around.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h |  3 +++
 include/linux/srcutree.h | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..8d56db70d417 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -34,6 +34,9 @@
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 
+#define RCU_SEQ_CTR_SHIFT    2
+#define RCU_SEQ_STATE_MASK   ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8f3f72480e78..ed57598394de 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -129,10 +129,23 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
+/*
+ * Values for initializing gp sequence fields. Higher values allow wrap arounds to
+ * occur earlier.
+ * The second value with state is useful in the case of static initialization of
+ * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its
+ * lower bits (or else it will appear to be already initialized within
+ * the call check_init_srcu_struct()).
+ */
+#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT)
+#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1)
+
 #define __SRCU_USAGE_INIT(name)									\
 {												\
 	.lock = __SPIN_LOCK_UNLOCKED(name.lock),						\
-	.srcu_gp_seq_needed = -1UL,								\
+	.srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL,							\
+	.srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE,				\
+	.srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL,					\
 	.work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0),					\
 }
 
-- 
cgit v1.2.3


From 711f5c5ce6c2c640c1b3b569ab2a8847be5ab21f Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 15 Jul 2024 21:22:51 -0400
Subject: lsm: cleanup lsm_hooks.h

Some cleanup and style corrections for lsm_hooks.h.

 * Drop the lsm_inode_alloc() extern declaration, it is not needed.
 * Relocate lsm_get_xattr_slot() and extern variables in the file to
   improve grouping of related objects.
 * Don't use tabs to needlessly align structure fields.

Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 87 +++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index f1ca8082075a..11ea0063228f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -51,8 +51,8 @@ struct security_hook_heads {
  * Contains the information that identifies the LSM.
  */
 struct lsm_id {
-	const char	*name;
-	u64		id;
+	const char *name;
+	u64 id;
 };
 
 /*
@@ -60,49 +60,31 @@ struct lsm_id {
  * For use with generic list macros for common operations.
  */
 struct security_hook_list {
-	struct hlist_node		list;
-	struct hlist_head		*head;
-	union security_list_options	hook;
-	const struct lsm_id		*lsmid;
+	struct hlist_node list;
+	struct hlist_head *head;
+	union security_list_options hook;
+	const struct lsm_id *lsmid;
 } __randomize_layout;
 
 /*
  * Security blob size or offset data.
  */
 struct lsm_blob_sizes {
-	int	lbs_cred;
-	int	lbs_file;
-	int	lbs_ib;
-	int	lbs_inode;
-	int	lbs_sock;
-	int	lbs_superblock;
-	int	lbs_ipc;
-	int	lbs_key;
-	int	lbs_msg_msg;
-	int	lbs_perf_event;
-	int	lbs_task;
-	int	lbs_xattr_count; /* number of xattr slots in new_xattrs array */
-	int	lbs_tun_dev;
+	int lbs_cred;
+	int lbs_file;
+	int lbs_ib;
+	int lbs_inode;
+	int lbs_sock;
+	int lbs_superblock;
+	int lbs_ipc;
+	int lbs_key;
+	int lbs_msg_msg;
+	int lbs_perf_event;
+	int lbs_task;
+	int lbs_xattr_count; /* number of xattr slots in new_xattrs array */
+	int lbs_tun_dev;
 };
 
-/**
- * lsm_get_xattr_slot - Return the next available slot and increment the index
- * @xattrs: array storing LSM-provided xattrs
- * @xattr_count: number of already stored xattrs (updated)
- *
- * Retrieve the first available slot in the @xattrs array to fill with an xattr,
- * and increment @xattr_count.
- *
- * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise.
- */
-static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs,
-					       int *xattr_count)
-{
-	if (unlikely(!xattrs))
-		return NULL;
-	return &xattrs[(*xattr_count)++];
-}
-
 /*
  * LSM_RET_VOID is used as the default value in LSM_HOOK definitions for void
  * LSM hooks (in include/linux/lsm_hook_defs.h).
@@ -118,9 +100,6 @@ static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs,
 #define LSM_HOOK_INIT(HEAD, HOOK) \
 	{ .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } }
 
-extern struct security_hook_heads security_hook_heads;
-extern char *lsm_names;
-
 extern void security_add_hooks(struct security_hook_list *hooks, int count,
 			       const struct lsm_id *lsmid);
 
@@ -142,9 +121,6 @@ struct lsm_info {
 	struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
 };
 
-extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
-extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
-
 #define DEFINE_LSM(lsm)							\
 	static struct lsm_info __lsm_##lsm				\
 		__used __section(".lsm_info.init")			\
@@ -155,6 +131,29 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
 		__used __section(".early_lsm_info.init")		\
 		__aligned(sizeof(unsigned long))
 
-extern int lsm_inode_alloc(struct inode *inode);
+/* DO NOT tamper with these variables outside of the LSM framework */
+extern char *lsm_names;
+extern struct security_hook_heads security_hook_heads;
+extern struct lsm_static_calls_table static_calls_table __ro_after_init;
+extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
+extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
+
+/**
+ * lsm_get_xattr_slot - Return the next available slot and increment the index
+ * @xattrs: array storing LSM-provided xattrs
+ * @xattr_count: number of already stored xattrs (updated)
+ *
+ * Retrieve the first available slot in the @xattrs array to fill with an xattr,
+ * and increment @xattr_count.
+ *
+ * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise.
+ */
+static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs,
+					       int *xattr_count)
+{
+	if (unlikely(!xattrs))
+		return NULL;
+	return &xattrs[(*xattr_count)++];
+}
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
-- 
cgit v1.2.3


From 63dff3e48871b0583be5032ff8fb7260c349a18c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 9 Jul 2024 19:43:06 -0400
Subject: lsm: add the inode_free_security_rcu() LSM implementation hook

The LSM framework has an existing inode_free_security() hook which
is used by LSMs that manage state associated with an inode, but
due to the use of RCU to protect the inode, special care must be
taken to ensure that the LSMs do not fully release the inode state
until it is safe from a RCU perspective.

This patch implements a new inode_free_security_rcu() implementation
hook which is called when it is safe to free the LSM's internal inode
state.  Unfortunately, this new hook does not have access to the inode
itself as it may already be released, so the existing
inode_free_security() hook is retained for those LSMs which require
access to the inode.

Cc: stable@vger.kernel.org
Reported-by: syzbot+5446fbf332b0602ede0b@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/00000000000076ba3b0617f65cc8@google.com
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 63e2656d1d56..520730fe2d94 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -114,6 +114,7 @@ LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask,
 	 unsigned int obj_type)
 LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode)
 LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode)
+LSM_HOOK(void, LSM_RET_VOID, inode_free_security_rcu, void *inode_security)
 LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode,
 	 struct inode *dir, const struct qstr *qstr, struct xattr *xattrs,
 	 int *xattr_count)
-- 
cgit v1.2.3


From 1da91ea87aefe2c25b68c9f96947a9271ba6325d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 14:12:01 -0400
Subject: introduce fd_file(), convert all accessors to it.

	For any changes of struct fd representation we need to
turn existing accesses to fields into calls of wrappers.
Accesses to struct fd::flags are very few (3 in linux/file.h,
1 in net/socket.c, 3 in fs/overlayfs/file.c and 3 more in
explicit initializers).
	Those can be dealt with in the commit converting to
new layout; accesses to struct fd::file are too many for that.
	This commit converts (almost) all of f.file to
fd_file(f).  It's not entirely mechanical ('file' is used as
a member name more than just in struct fd) and it does not
even attempt to distinguish the uses in pointer context from
those in boolean context; the latter will be eventually turned
into a separate helper (fd_empty()).

	NOTE: mass conversion to fd_empty(), tempting as it
might be, is a bad idea; better do that piecewise in commit
that convert from fdget...() to CLASS(...).

[conflicts in fs/fhandle.c, kernel/bpf/syscall.c, mm/memcontrol.c
caught by git; fs/stat.c one got caught by git grep]
[fs/xattr.c conflict]

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/cleanup.h | 2 +-
 include/linux/file.h    | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index d9e613803df1..a3d3e888cf1f 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -98,7 +98,7 @@ const volatile void * __must_check_fn(const volatile void *val)
  * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
  *
  *	CLASS(fdget, f)(fd);
- *	if (!f.file)
+ *	if (!fd_file(f))
  *		return -EBADF;
  *
  *	// use 'f' without concern
diff --git a/include/linux/file.h b/include/linux/file.h
index 237931f20739..0f3f369f2450 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -42,10 +42,12 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+#define fd_file(f) ((f).file)
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
-		fput(fd.file);
+		fput(fd_file(fd));
 }
 
 extern struct file *fget(unsigned int fd);
@@ -79,7 +81,7 @@ static inline struct fd fdget_pos(int fd)
 static inline void fdput_pos(struct fd f)
 {
 	if (f.flags & FDPUT_POS_UNLOCK)
-		__f_unlock_pos(f.file);
+		__f_unlock_pos(fd_file(f));
 	fdput(f);
 }
 
-- 
cgit v1.2.3


From 88a2f6468d013ca1163490dbddfc95135d1c27a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 15:45:12 -0400
Subject: struct fd: representation change

	We want the compiler to see that fdput() on empty instance
is a no-op.  The emptiness check is that file reference is NULL,
while fdput() is "fput() if FDPUT_FPUT is present in flags".
The reason why fdput() on empty instance is a no-op is something
compiler can't see - it's that we never generate instances with
NULL file reference combined with non-zero flags.

	It's not that hard to deal with - the real primitives behind
fdget() et.al. are returning an unsigned long value, unpacked by (inlined)
__to_fd() into the current struct file * + int.  The lower bits are
used to store flags, while the rest encodes the pointer.  Linus suggested
that keeping this unsigned long around with the extractions done by inlined
accessors should generate a sane code and that turns out to be the case.
Namely, turning struct fd into a struct-wrapped unsinged long, with
        fd_empty(f) => unlikely(f.word == 0)
	fd_file(f) => (struct file *)(f.word & ~3)
	fdput(f) => if (f.word & 1) fput(fd_file(f))
ends up with compiler doing the right thing.  The cost is the patch
footprint, of course - we need to switch f.file to fd_file(f) all over
the tree, and it's not doable with simple search and replace; there are
false positives, etc.

	Note that the sole member of that structure is an opaque
unsigned long - all accesses should be done via wrappers and I don't
want to use a name that would invite manual casts to file pointers,
etc.  The value of that member is equal either to (unsigned long)p | flags,
p being an address of some struct file instance, or to 0 for an empty fd.

	For now the new predicate (fd_empty(f)) has no users; all the
existing checks have form (!fd_file(f)).  We will convert to fd_empty()
use later; here we only define it (and tell the compiler that it's
unlikely to return true).

	This commit only deals with representation change; there will
be followups.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/file.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index 0f3f369f2450..eb28469b1c16 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -35,18 +35,28 @@ static inline void fput_light(struct file *file, int fput_needed)
 		fput(file);
 }
 
+/* either a reference to struct file + flags
+ * (cloned vs. borrowed, pos locked), with
+ * flags stored in lower bits of value,
+ * or empty (represented by 0).
+ */
 struct fd {
-	struct file *file;
-	unsigned int flags;
+	unsigned long word;
 };
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
-#define fd_file(f) ((f).file)
+#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
+static inline bool fd_empty(struct fd f)
+{
+	return unlikely(!f.word);
+}
+
+#define EMPTY_FD (struct fd){0}
 
 static inline void fdput(struct fd fd)
 {
-	if (fd.flags & FDPUT_FPUT)
+	if (fd.word & FDPUT_FPUT)
 		fput(fd_file(fd));
 }
 
@@ -60,7 +70,7 @@ extern void __f_unlock_pos(struct file *);
 
 static inline struct fd __to_fd(unsigned long v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){v};
 }
 
 static inline struct fd fdget(unsigned int fd)
@@ -80,7 +90,7 @@ static inline struct fd fdget_pos(int fd)
 
 static inline void fdput_pos(struct fd f)
 {
-	if (f.flags & FDPUT_POS_UNLOCK)
+	if (f.word & FDPUT_POS_UNLOCK)
 		__f_unlock_pos(fd_file(f));
 	fdput(f);
 }
-- 
cgit v1.2.3


From de12c3391bce10504c0e7bd767516c74110cfce1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 16:34:25 -0400
Subject: add struct fd constructors, get rid of __to_fd()

	Make __fdget() et.al. return struct fd directly.
New helpers: BORROWED_FD(file) and CLONED_FD(file), for
borrowed and cloned file references resp.

	NOTE: this might need tuning; in particular, inline on
__fget_light() is there to keep the code generation same as
before - we probably want to keep it inlined in fdget() et.al.
(especially so in fdget_pos()), but that needs profiling.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/file.h | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/file.h b/include/linux/file.h
index eb28469b1c16..aab1caff6713 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -53,6 +53,14 @@ static inline bool fd_empty(struct fd f)
 }
 
 #define EMPTY_FD (struct fd){0}
+static inline struct fd BORROWED_FD(struct file *f)
+{
+	return (struct fd){(unsigned long)f};
+}
+static inline struct fd CLONED_FD(struct file *f)
+{
+	return (struct fd){(unsigned long)f | FDPUT_FPUT};
+}
 
 static inline void fdput(struct fd fd)
 {
@@ -63,30 +71,11 @@ static inline void fdput(struct fd fd)
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
 extern struct file *fget_task(struct task_struct *task, unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
-{
-	return (struct fd){v};
-}
-
-static inline struct fd fdget(unsigned int fd)
-{
-	return __to_fd(__fdget(fd));
-}
-
-static inline struct fd fdget_raw(unsigned int fd)
-{
-	return __to_fd(__fdget_raw(fd));
-}
-
-static inline struct fd fdget_pos(int fd)
-{
-	return __to_fd(__fdget_pos(fd));
-}
+struct fd fdget(unsigned int fd);
+struct fd fdget_raw(unsigned int fd);
+struct fd fdget_pos(unsigned int fd);
 
 static inline void fdput_pos(struct fd f)
 {
-- 
cgit v1.2.3


From 12dbc67c3b0bcd7414b511fd8a42ba4e388705bc Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Wed, 7 Aug 2024 21:45:28 +0800
Subject: net: stmmac: Move the atds flag to the stmmac_dma_cfg structure

ATDS (Alternate Descriptor Size) is a part of the DMA Bus Mode configs
(together with PBL, ALL, EME, etc) of the DW GMAC controllers. Seeing
it's not changed at runtime but is activated as long as the IP-core
has it supported (at least due to the Type 2 Full Checksum Offload
Engine feature), move the respective parameter from the
stmmac_dma_ops::init() callback argument to the stmmac_dma_cfg
structure, which already have the rest of the DMA-related configs
defined.

Besides the being added in the next commit DW GMAC multi-channels
support will require to add the stmmac_dma_ops::init_chan() callback
and have the ATDS flag set/cleared for each channel in there. Having
the atds-flag in the stmmac_dma_cfg structure will make the parameter
accessible from stmmac_dma_ops::init_chan() callback too.

Signed-off-by: Feiyang Chen <chenfeiyang@loongson.cn>
Signed-off-by: Yinggang Gu <guyinggang@loongson.cn>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 84e13bd5df28..338991c08f00 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -100,6 +100,7 @@ struct stmmac_dma_cfg {
 	bool eame;
 	bool multi_msi_en;
 	bool dche;
+	bool atds;
 };
 
 #define AXI_BLEN	7
-- 
cgit v1.2.3


From c343e66ed009fdb6206787558130ccbd504ffc12 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 10 Aug 2024 22:52:17 +0200
Subject: usb: gadget: configfs: Make check_user_usb_string() static

"linux/usb/gadget_configfs.h" is only included in
"drivers/usb/gadget/configfs.c", so there is no need to declare a function
in the header file. it is only used in this .c file.

It's better to have it static.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/958cb49dca1bff4254a3492c018efbf3b01918b4.1723323107.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/gadget_configfs.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget_configfs.h b/include/linux/usb/gadget_configfs.h
index d61aebd68128..6a552dd4dec9 100644
--- a/include/linux/usb/gadget_configfs.h
+++ b/include/linux/usb/gadget_configfs.h
@@ -4,9 +4,6 @@
 
 #include <linux/configfs.h>
 
-int check_user_usb_string(const char *name,
-		struct usb_gadget_strings *stringtab_dev);
-
 #define GS_STRINGS_W(__struct, __name)	\
 static ssize_t __struct##_##__name##_store(struct config_item *item, \
 		const char *page, size_t len)		\
-- 
cgit v1.2.3


From d1e14e06810a96ef0cdabf572513594031d4dad6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 10 Aug 2024 23:05:46 +0200
Subject: usb: gadget: configfs: Constify struct config_item_type

'struct config_item_type' is not modified in this file.

Apparently, these structures are only used with
config_group_init_type_name() which takes a const struct config_item_type*
as a 3rd argument.

Constifying this structure moves some data to a read-only section, so
increase overall security, especially when the structure holds some
function pointers.

On a x86_64, with allmodconfig:
Before:
======
   text	   data	    bss	    dec	    hex	filename
  40834	   5112	     64	  46010	   b3ba	drivers/usb/gadget/configfs.o

After:
=====
   text	   data	    bss	    dec	    hex	filename
  41218	   4728	     64	  46010	   b3ba	drivers/usb/gadget/configfs.o

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/513223e97082e1bb758e36d55c175ec9ea34a71c.1723323896.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/gadget_configfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/gadget_configfs.h b/include/linux/usb/gadget_configfs.h
index 6a552dd4dec9..6b5d6838f865 100644
--- a/include/linux/usb/gadget_configfs.h
+++ b/include/linux/usb/gadget_configfs.h
@@ -34,7 +34,7 @@ static struct configfs_item_operations struct_in##_langid_item_ops = {	\
 	.release                = struct_in##_attr_release,		\
 };									\
 									\
-static struct config_item_type struct_in##_langid_type = {		\
+static const struct config_item_type struct_in##_langid_type = {	\
 	.ct_item_ops	= &struct_in##_langid_item_ops,			\
 	.ct_attrs	= struct_in##_langid_attrs,			\
 	.ct_owner	= THIS_MODULE,					\
@@ -91,7 +91,7 @@ static struct configfs_group_operations struct_in##_strings_ops = {	\
 	.drop_item      = &struct_in##_strings_drop,			\
 };									\
 									\
-static struct config_item_type struct_in##_strings_type = {		\
+static const struct config_item_type struct_in##_strings_type = {	\
 	.ct_group_ops   = &struct_in##_strings_ops,			\
 	.ct_owner       = THIS_MODULE,					\
 }
-- 
cgit v1.2.3


From 1ef33652d22c69a2a7519d003cd6c79b8e514f44 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 8 Aug 2024 05:25:07 -0700
Subject: net: netpoll: extract core of netpoll_cleanup

Extract the core part of netpoll_cleanup(), so, it could be called from
a caller that has the rtnl lock already.

Netconsole uses this in a weird way right now:

	__netpoll_cleanup(&nt->np);
	spin_lock_irqsave(&target_list_lock, flags);
	netdev_put(nt->np.dev, &nt->np.dev_tracker);
	nt->np.dev = NULL;
	nt->enabled = false;

This will be replaced by do_netpoll_cleanup() as the locking situation
is overhauled.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netpoll.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index bd19c4b91e31..cd4e28db0cbd 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -64,6 +64,7 @@ int netpoll_setup(struct netpoll *np);
 void __netpoll_cleanup(struct netpoll *np);
 void __netpoll_free(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
+void do_netpoll_cleanup(struct netpoll *np);
 netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
 
 #ifdef CONFIG_NETPOLL
-- 
cgit v1.2.3


From dde286ee57704226b500cb9eb59547fec07aad3d Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Thu, 8 Aug 2024 15:36:28 +0300
Subject: regmap: Allow setting IRQ domain name suffix

When multiple IRQ domains are created from the same device-tree node they
will get the same name based on the device-tree path. This will cause a
naming collision in debugFS when IRQ domain specific entries are created.

The regmap-IRQ creates per instance IRQ domains. This will lead to a
domain name conflict when a device which provides more than one
interrupt line uses the regmap-IRQ.

Add support for specifying an IRQ domain name suffix when creating a
regmap-IRQ controller.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://patch.msgid.link/776bc4996969e5081bcf61b9bdb5517e537147a3.1723120028.git.mazziesaccount@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 122e38161acb..f9ccad32fc5c 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1521,6 +1521,9 @@ struct regmap_irq_chip_data;
  * struct regmap_irq_chip - Description of a generic regmap irq_chip.
  *
  * @name:        Descriptive name for IRQ controller.
+ * @domain_suffix: Name suffix to be appended to end of IRQ domain name. Needed
+ *		   when multiple regmap-IRQ controllers are created from same
+ *		   device.
  *
  * @main_status: Base main status register address. For chips which have
  *		 interrupts arranged in separate sub-irq blocks with own IRQ
@@ -1606,6 +1609,7 @@ struct regmap_irq_chip_data;
  */
 struct regmap_irq_chip {
 	const char *name;
+	const char *domain_suffix;
 
 	unsigned int main_status;
 	unsigned int num_main_status_bits;
-- 
cgit v1.2.3


From ec0a7d44b358afaaf52856d03c72e20587bc888b Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Fri, 9 Aug 2024 15:28:25 -0700
Subject: workqueue: Add interface for user-defined workqueue lockdep map

Add an interface for a user-defined workqueue lockdep map, which is
helpful when multiple workqueues are created for the same purpose. This
also helps avoid leaking lockdep maps on each workqueue creation.

v2:
 - Add alloc_workqueue_lockdep_map (Tejun)
v3:
 - Drop __WQ_USER_OWNED_LOCKDEP (Tejun)
 - static inline alloc_ordered_workqueue_lockdep_map (Tejun)

Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4eb8f9563136..8ccbf510880b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -507,6 +507,58 @@ void workqueue_softirq_dead(unsigned int cpu);
 __printf(1, 4) struct workqueue_struct *
 alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);
 
+#ifdef CONFIG_LOCKDEP
+/**
+ * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map
+ * @fmt: printf format for the name of the workqueue
+ * @flags: WQ_* flags
+ * @max_active: max in-flight work items, 0 for default
+ * @lockdep_map: user-defined lockdep_map
+ * @...: args for @fmt
+ *
+ * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for
+ * workqueues created with the same purpose and to avoid leaking a lockdep_map
+ * on each workqueue creation.
+ *
+ * RETURNS:
+ * Pointer to the allocated workqueue on success, %NULL on failure.
+ */
+__printf(1, 5) struct workqueue_struct *
+alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
+			    struct lockdep_map *lockdep_map, ...);
+
+/**
+ * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with
+ * user-defined lockdep_map
+ *
+ * @fmt: printf format for the name of the workqueue
+ * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
+ * @lockdep_map: user-defined lockdep_map
+ * @args: args for @fmt
+ *
+ * Same as alloc_ordered_workqueue but with the a user-define lockdep_map.
+ * Useful for workqueues created with the same purpose and to avoid leaking a
+ * lockdep_map on each workqueue creation.
+ *
+ * RETURNS:
+ * Pointer to the allocated workqueue on success, %NULL on failure.
+ */
+__printf(1, 4) static inline struct workqueue_struct *
+alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags,
+				    struct lockdep_map *lockdep_map, ...)
+{
+	struct workqueue_struct *wq;
+	va_list args;
+
+	va_start(args, lockdep_map);
+	wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags,
+					 1, lockdep_map, args);
+	va_end(args);
+
+	return wq;
+}
+#endif
+
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
  * @fmt: printf format for the name of the workqueue
-- 
cgit v1.2.3


From fb4dae4ca315fe48f1a8a452172f29196b2a7f3d Mon Sep 17 00:00:00 2001
From: Junfeng Guo <junfeng.guo@intel.com>
Date: Thu, 25 Jul 2024 16:08:05 -0600
Subject: virtchnl: support raw packet in protocol header

The patch extends existing virtchnl_proto_hdrs structure to allow VF
to pass a pair of buffers as packet data and mask that describe
a match pattern of a filter rule. Then the kernel PF driver is requested
to parse the pair of buffer and figure out low level hardware metadata
(ptype, profile, field vector.. ) to program the expected FDIR or RSS
rules.

Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Junfeng Guo <junfeng.guo@intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 8e177b67e82f..4f78a65e33dc 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -1121,6 +1121,7 @@ enum virtchnl_vfr_states {
 };
 
 #define VIRTCHNL_MAX_NUM_PROTO_HDRS	32
+#define VIRTCHNL_MAX_SIZE_RAW_PACKET	1024
 #define PROTO_HDR_SHIFT			5
 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT)
 #define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1)
@@ -1266,13 +1267,22 @@ struct virtchnl_proto_hdrs {
 	u8 pad[3];
 	/**
 	 * specify where protocol header start from.
+	 * must be 0 when sending a raw packet request.
 	 * 0 - from the outer layer
 	 * 1 - from the first inner layer
 	 * 2 - from the second inner layer
 	 * ....
 	 **/
 	int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */
-	struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
+	union {
+		struct virtchnl_proto_hdr
+			proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
+		struct {
+			u16 pkt_len;
+			u8 spec[VIRTCHNL_MAX_SIZE_RAW_PACKET];
+			u8 mask[VIRTCHNL_MAX_SIZE_RAW_PACKET];
+		} raw;
+	};
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs);
-- 
cgit v1.2.3


From 623122ac1c4074791ea319df4676fb2875817fe4 Mon Sep 17 00:00:00 2001
From: Ahmed Zaki <ahmed.zaki@intel.com>
Date: Thu, 25 Jul 2024 16:08:09 -0600
Subject: iavf: add support for offloading tc U32 cls filters

Add support for offloading cls U32 filters. Only "skbedit queue_mapping"
and "drop" actions are supported. Also, only "ip" and "802_3" tc
protocols are allowed. The PF must advertise the VIRTCHNL_VF_OFFLOAD_TC_U32
capability flag.

Since the filters will be enabled via the FD stage at the PF, a new type
of FDIR filters is added and the existing list and state machine are used.

The new filters can be used to configure flow directors based on raw
(binary) pattern in the rx packet.

Examples:

0. # tc qdisc add dev enp175s0v0  ingress

1. Redirect UDP from src IP 192.168.2.1 to queue 12:

    # tc filter add dev <dev> protocol ip ingress u32 \
	match u32 0x45000000 0xff000000 at 0  \
	match u32 0x00110000 0x00ff0000 at 8  \
	match u32 0xC0A80201 0xffffffff at 12 \
	match u32 0x00000000 0x00000000 at 24 \
	action skbedit queue_mapping 12 skip_sw

2. Drop all ICMP:

    # tc filter add dev <dev> protocol ip ingress u32 \
	match u32 0x45000000 0xff000000 at 0  \
	match u32 0x00010000 0x00ff0000 at 8  \
	match u32 0x00000000 0x00000000 at 24 \
	action drop skip_sw

3. Redirect ICMP traffic from MAC 3c:fd:fe:a5:47:e0 to queue 7
   (note proto: 802_3):

   # tc filter add dev <dev> protocol 802_3 ingress u32 \
	match u32 0x00003CFD 0x0000ffff at 4   \
	match u32 0xFEA547E0 0xffffffff at 8   \
	match u32 0x08004500 0xffffff00 at 12  \
	match u32 0x00000001 0x000000ff at 20  \
	match u32 0x0000 0x0000 at 40          \
	action skbedit queue_mapping 7 skip_sw

Notes on matches:
1 - All intermediate fields that are needed to parse the correct PTYPE
    must be provided (in e.g. 3: Ethernet Type 0x0800 in MAC, IP version
    and IP length: 0x45 and protocol: 0x01 (ICMP)).
2 - The last match must provide an offset that guarantees all required
    headers are accounted for, even if the last header is not matched.
    For example, in #2, the last match is 4 bytes at offset 24 starting
    from IP header, so the total is 14 (MAC) + 24 + 4 = 42, which is the
    sum of MAC+IP+ICMP headers.

Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 4f78a65e33dc..f41395264dca 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -247,6 +247,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 /* used to negotiate communicating link speeds in Mbps */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		BIT(7)
 #define  VIRTCHNL_VF_OFFLOAD_CRC		BIT(10)
+#define VIRTCHNL_VF_OFFLOAD_TC_U32		BIT(11)
 #define VIRTCHNL_VF_OFFLOAD_VLAN_V2		BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN		BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		BIT(17)
-- 
cgit v1.2.3


From 55f325958ccc41eaea43eb4546d4dc77c1b5ef8a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Jul 2024 01:16:04 -0400
Subject: bpf: switch maps to CLASS(fd, ...)

        Calling conventions for __bpf_map_get() would be more convenient
if it left fpdut() on failure to callers.  Makes for simpler logics
in the callers.

	Among other things, the proof of memory safety no longer has to
rely upon file->private_data never being ERR_PTR(...) for bpffs files.
Original calling conventions made it impossible for the caller to tell
whether __bpf_map_get() has returned ERR_PTR(-EINVAL) because it has found
the file not be a bpf map one (in which case it would've done fdput())
or because it found that ERR_PTR(-EINVAL) in file->private_data of a
bpf map file (in which case fdput() would _not_ have been done).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/bpf.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9425e410bcb..9f35df07e86d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2241,7 +2241,16 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
-struct bpf_map *__bpf_map_get(struct fd f);
+
+static inline struct bpf_map *__bpf_map_get(struct fd f)
+{
+	if (fd_empty(f))
+		return ERR_PTR(-EBADF);
+	if (unlikely(fd_file(f)->f_op != &bpf_map_fops))
+		return ERR_PTR(-EINVAL);
+	return fd_file(f)->private_data;
+}
+
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
-- 
cgit v1.2.3


From 7945b741d1fc071a621366c512a060ea08848955 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jul 2024 19:47:47 -0700
Subject: rcu-tasks: Remove RCU Tasks Rude asynchronous APIs

The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently
unused.  This commit therefore removes their definitions and boot-time
self-tests.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13f6f00aecf9..31e679c7110e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -37,7 +37,6 @@
 /* Exported common interfaces */
 void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
-void rcu_barrier_tasks_rude(void);
 void synchronize_rcu(void);
 
 struct rcu_gp_oldstate;
@@ -202,7 +201,6 @@ do {									\
 } while (0)
 
 # ifdef CONFIG_TASKS_RUDE_RCU
-void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks_rude(void);
 # endif
 
-- 
cgit v1.2.3


From fe91cf39db0939ac8d523b5a1c31840f7cbe205c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 1 Aug 2024 17:34:25 -0700
Subject: rcu/tasks: Add detailed grace-period and barrier diagnostics

This commit adds rcu_tasks_torture_stats_print(),
rcu_tasks_trace_torture_stats_print(), and
rcu_tasks_rude_torture_stats_print() functions that provide detailed
diagnostics on grace-period, callback, and barrier state.

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcupdate.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 31e679c7110e..17463e95b6ef 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -164,6 +164,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
 	} while (0)
 void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks(void);
+void rcu_tasks_torture_stats_print(char *tt, char *tf);
 # else
 # define rcu_tasks_classic_qs(t, preempt) do { } while (0)
 # define call_rcu_tasks call_rcu
@@ -190,6 +191,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t);
 			rcu_tasks_trace_qs_blkd(t);				\
 		}								\
 	} while (0)
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf);
 # else
 # define rcu_tasks_trace_qs(t) do { } while (0)
 # endif
@@ -202,6 +204,7 @@ do {									\
 
 # ifdef CONFIG_TASKS_RUDE_RCU
 void synchronize_rcu_tasks_rude(void);
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf);
 # endif
 
 #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
-- 
cgit v1.2.3


From 44f65d900698278a8451988abe0d5ca37fd46882 Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@chromium.org>
Date: Tue, 6 Aug 2024 21:49:27 +0000
Subject: binfmt_elf: mseal address zero
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In load_elf_binary as part of the execve(), when the current
task’s personality has MMAP_PAGE_ZERO set, the kernel allocates
one page at address 0. According to the comment:

/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
    and some applications "depend" upon this behavior.
    Since we do not have the power to recompile these, we
     emulate the SVr4 behavior. Sigh. */

At one point, Linus suggested removing this [1].

Code search in debian didn't see much use of MMAP_PAGE_ZERO [2],
it exists in util and test (rr).

Sealing this is probably safe, the comment doesn't say
the app ever wanting to change the mapping to rwx. Sealing
also ensures that never happens.

If there is a complaint, we can make this configurable.

Link: https://lore.kernel.org/lkml/CAHk-=whVa=nm_GW=NVfPHqcxDbWt4JjjK1YWb0cLjO4ZSGyiDA@mail.gmail.com/ [1]
Link: https://codesearch.debian.net/search?q=MMAP_PAGE_ZERO&literal=1&perpkg=1&page=1 [2]
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Link: https://lore.kernel.org/r/20240806214931.2198172-2-jeffxu@google.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/mm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..a178c15812eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4201,4 +4201,14 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma);
 
 int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
 
+#ifdef CONFIG_64BIT
+int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
+#else
+static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+{
+	/* noop on 32 bit */
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Jul 2024 13:33:58 -0700
Subject: locking/csd_lock: Provide an indication of ongoing CSD-lock stall

If a CSD-lock stall goes on long enough, it will cause an RCU CPU
stall warning.  This additional warning provides much additional
console-log traffic and little additional information.  Therefore,
provide a new csd_lock_is_stuck() function that returns true if there
is an ongoing CSD-lock stall.  This function will be used by the RCU
CPU stall warnings to provide a one-line indication of the stall when
this function returns true.

[ neeraj.upadhyay: Apply Rik van Riel feedback. ]
[ neeraj.upadhyay: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/smp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index fcd61dfe2af3..3871bd32018f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu);
 int smpcfd_dead_cpu(unsigned int cpu);
 int smpcfd_dying_cpu(unsigned int cpu);
 
+#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
+bool csd_lock_is_stuck(void);
+#else
+static inline bool csd_lock_is_stuck(void) { return false; }
+#endif
+
 #endif /* __LINUX_SMP_H */
-- 
cgit v1.2.3


From fda70207135b60dd1a7c8f16cc036bb1cc344490 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Tue, 16 Apr 2024 11:01:23 +0200
Subject: context_tracking, rcu: Rename rcu_dynticks_curr_cpu_in_eqs() into
 rcu_is_watching_curr_cpu()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, reflect that change in the related helpers.

Note that "watching" is the opposite of "in EQS", so the negation is lifted
out of the helper and into the callsites.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index a6c36780cc3b..d53092ffa9db 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -113,13 +113,17 @@ extern void ct_idle_enter(void);
 extern void ct_idle_exit(void);
 
 /*
- * Is the current CPU in an extended quiescent state?
+ * Is RCU watching the current CPU (IOW, it is not in an extended quiescent state)?
+ *
+ * Note that this returns the actual boolean data (watching / not watching),
+ * whereas ct_rcu_watching() returns the RCU_WATCHING subvariable of
+ * context_tracking.state.
  *
  * No ordering, as we are sampling CPU-local information.
  */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
+static __always_inline bool rcu_is_watching_curr_cpu(void)
 {
-	return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING);
+	return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING;
 }
 
 /*
@@ -140,7 +144,7 @@ static __always_inline bool warn_rcu_enter(void)
 	 * lots of the actual reporting also relies on RCU.
 	 */
 	preempt_disable_notrace();
-	if (rcu_dynticks_curr_cpu_in_eqs()) {
+	if (!rcu_is_watching_curr_cpu()) {
 		ret = true;
 		ct_state_inc(CT_RCU_WATCHING);
 	}
-- 
cgit v1.2.3


From 32a9f26e5e266f0116cc8536e5e4544523ddd334 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Mon, 29 Apr 2024 16:43:36 +0200
Subject: rcu: Rename rcu_momentary_dyntick_idle() into rcu_momentary_eqs()

The context_tracking.state RCU_DYNTICKS subvariable has been renamed to
RCU_WATCHING, replace "dyntick_idle" into "eqs" to drop the dyntick
reference.

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcutiny.h | 2 +-
 include/linux/rcutree.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d9ac7b136aea..cf2b5a188f78 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -158,7 +158,7 @@ void rcu_scheduler_starting(void);
 static inline void rcu_end_inkernel_boot(void) { }
 static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
 static inline bool rcu_is_watching(void) { return true; }
-static inline void rcu_momentary_dyntick_idle(void) { }
+static inline void rcu_momentary_eqs(void) { }
 static inline void kfree_rcu_scheduler_running(void) { }
 static inline bool rcu_gp_might_be_stalled(void) { return false; }
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 254244202ea9..7dbde2b6f714 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,7 @@ void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 
 void rcu_barrier(void);
-void rcu_momentary_dyntick_idle(void);
+void rcu_momentary_eqs(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
 
-- 
cgit v1.2.3


From 4040b1139904b3f72b56862fc9a8d3e9abb69ffb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 17 Jun 2024 08:51:14 -0700
Subject: context_tracking: Tag context_tracking_enabled_this_cpu()
 __always_inline

Force context_tracking_enabled_this_cpu() to be inlined so that invoking
it from guest_context_enter_irqoff(), which KVM uses in non-instrumentable
code, doesn't unexpectedly leave a noinstr section.

  vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1c7: call to
    context_tracking_enabled_this_cpu() leaves .noinstr.text section
  vmlinux.o: warning: objtool: svm_vcpu_enter_exit+0x83: call to
    context_tracking_enabled_this_cpu() leaves .noinstr.text section

Note, the CONFIG_CONTEXT_TRACKING_USER=n stub is already __always_inline.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/context_tracking_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 0dbda59c9f37..7b8433d5a8ef 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -113,7 +113,7 @@ static __always_inline bool context_tracking_enabled_cpu(int cpu)
 	return context_tracking_enabled() && per_cpu(context_tracking.active, cpu);
 }
 
-static inline bool context_tracking_enabled_this_cpu(void)
+static __always_inline bool context_tracking_enabled_this_cpu(void)
 {
 	return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
 }
-- 
cgit v1.2.3


From a98ae7f045b29de4f48b191d5aeb4e803183d759 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date: Thu, 25 Jul 2024 12:18:40 +0200
Subject: lib/string_choices: Add str_up_down() helper

Add str_up_down() helper to return "up" or "down" string literal.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://lore.kernel.org/r/20240725101841.574-1-michal.wajdeczko@intel.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string_choices.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index d9ebe20229f8..bcde3c9cff81 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -42,6 +42,11 @@ static inline const char *str_yes_no(bool v)
 	return v ? "yes" : "no";
 }
 
+static inline const char *str_up_down(bool v)
+{
+	return v ? "up" : "down";
+}
+
 /**
  * str_plural - Return the simple pluralization based on English counts
  * @num: Number used for deciding pluralization
-- 
cgit v1.2.3


From f5c1ca3a15fdb867d2b535003f74e0b975eff116 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 12 Aug 2024 11:29:40 -0700
Subject: string_choices: Add wrapper for str_down_up()

The string choice functions which are not clearly true/false synonyms
also have inverted wrappers. Add this for str_down_up() as well.

Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20240812182939.work.424-kees@kernel.org
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string_choices.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index bcde3c9cff81..1320bcdcb89c 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -46,6 +46,7 @@ static inline const char *str_up_down(bool v)
 {
 	return v ? "up" : "down";
 }
+#define str_down_up(v)		str_up_down(!(v))
 
 /**
  * str_plural - Return the simple pluralization based on English counts
-- 
cgit v1.2.3


From 0cb70ee4a6ee6b0a4b0e0f70a64a094c6fe05944 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 14 Aug 2024 13:22:25 +0800
Subject: virtio: rename virtio_config_enabled to virtio_config_core_enabled

Following patch will allow the config interrupt to be disabled by a
specific driver via another boolean. So this patch renames
virtio_config_enabled and relevant helpers to
virtio_config_core_enabled.

Cc: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Cc: Gia-Khanh Nguyen <gia-khanh.nguyen@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20240814052228.4654-2-jasowang@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 4b16844c6bc2..de6d3cc176d9 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -118,7 +118,7 @@ struct virtio_admin_cmd {
  * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
  * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
- * @config_enabled: configuration change reporting enabled
+ * @config_core_enabled: configuration change reporting enabled by core
  * @config_change_pending: configuration change reported while disabled
  * @config_lock: protects configuration change reporting
  * @vqs_list_lock: protects @vqs.
@@ -135,7 +135,7 @@ struct virtio_admin_cmd {
 struct virtio_device {
 	int index;
 	bool failed;
-	bool config_enabled;
+	bool config_core_enabled;
 	bool config_change_pending;
 	spinlock_t config_lock;
 	spinlock_t vqs_list_lock;
-- 
cgit v1.2.3


From 224de6f886f8184fd448700a6d78f216694de948 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 14 Aug 2024 13:22:26 +0800
Subject: virtio: allow driver to disable the configure change notification

Sometime, it would be useful to disable the configure change
notification from the driver. So this patch allows this by introducing
a variable config_change_driver_disabled and only allow the configure
change notification callback to be triggered when it is allowed by
both the virtio core and the driver. It is set to false by default to
hold the current semantic so we don't need to change any drivers.

The first user for this would be virtio-net.

Cc: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Cc: Gia-Khanh Nguyen <gia-khanh.nguyen@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20240814052228.4654-3-jasowang@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index de6d3cc176d9..306137a15d07 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -119,6 +119,8 @@ struct virtio_admin_cmd {
  * @index: unique position on the virtio bus
  * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
  * @config_core_enabled: configuration change reporting enabled by core
+ * @config_driver_disabled: configuration change reporting disabled by
+ *                          a driver
  * @config_change_pending: configuration change reported while disabled
  * @config_lock: protects configuration change reporting
  * @vqs_list_lock: protects @vqs.
@@ -136,6 +138,7 @@ struct virtio_device {
 	int index;
 	bool failed;
 	bool config_core_enabled;
+	bool config_driver_disabled;
 	bool config_change_pending;
 	spinlock_t config_lock;
 	spinlock_t vqs_list_lock;
@@ -166,6 +169,10 @@ void __virtqueue_break(struct virtqueue *_vq);
 void __virtqueue_unbreak(struct virtqueue *_vq);
 
 void virtio_config_changed(struct virtio_device *dev);
+
+void virtio_config_driver_disable(struct virtio_device *dev);
+void virtio_config_driver_enable(struct virtio_device *dev);
+
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
 int virtio_device_restore(struct virtio_device *dev);
-- 
cgit v1.2.3


From 3942bb49728ad9e1f94d953a88af169a8f5d8099 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 14 Aug 2024 13:00:34 +0300
Subject: string: add mem_is_zero() helper to check if memory area is all zeros

Almost two thirds of the memchr_inv() usages check if the memory area is
all zeros, with no interest in where in the buffer the first non-zero
byte is located. Checking for !memchr_inv(s, 0, n) is also not very
intuitive or discoverable. Add an explicit mem_is_zero() helper for this
use case.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240814100035.3100852-1-jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 include/linux/string.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 9edace076ddb..5855c5626b4b 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -279,6 +279,18 @@ static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
 void *memchr_inv(const void *s, int c, size_t n);
 char *strreplace(char *str, char old, char new);
 
+/**
+ * mem_is_zero - Check if an area of memory is all 0's.
+ * @s: The memory area
+ * @n: The size of the area
+ *
+ * Return: True if the area of memory is all 0's.
+ */
+static inline bool mem_is_zero(const void *s, size_t n)
+{
+	return !memchr_inv(s, 0, n);
+}
+
 extern void kfree_const(const void *x);
 
 extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
-- 
cgit v1.2.3


From bf5ffc8c80e0cf5205849cd0c9c3cb261d2beee6 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 10:51:18 -0600
Subject: perf: arm_pmu: Remove event index to counter remapping

Xscale and Armv6 PMUs defined the cycle counter at 0 and event counters
starting at 1 and had 1:1 event index to counter numbering. On Armv7 and
later, this changed the cycle counter to 31 and event counters start at
0. The drivers for Armv7 and PMUv3 kept the old event index numbering
and introduced an event index to counter conversion. The conversion uses
masking to convert from event index to a counter number. This operation
relies on having at most 32 counters so that the cycle counter index 0
can be transformed to counter number 31.

Armv9.4 adds support for an additional fixed function counter
(instructions) which increases possible counters to more than 32, and
the conversion won't work anymore as a simple subtract and mask. The
primary reason for the translation (other than history) seems to be to
have a contiguous mask of counters 0-N. Keeping that would result in
more complicated index to counter conversions. Instead, store a mask of
available counters rather than just number of events. That provides more
information in addition to the number of events.

No (intended) functional changes.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-1-280a8d7ff465@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmu.h   | 2 +-
 include/linux/perf/arm_pmuv3.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index b3b34f6670cf..e5d6d204beab 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -96,7 +96,7 @@ struct arm_pmu {
 	void		(*stop)(struct arm_pmu *);
 	void		(*reset)(void *);
 	int		(*map_event)(struct perf_event *event);
-	int		num_events;
+	DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS);
 	bool		secure_access; /* 32-bit ARM only */
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
 	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 7867db04ec98..eccbdd8eb98f 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -6,6 +6,7 @@
 #ifndef __PERF_ARM_PMUV3_H
 #define __PERF_ARM_PMUV3_H
 
+#define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
 #define ARMV8_PMU_MAX_COUNTERS	32
 #define ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
 
-- 
cgit v1.2.3


From f9b11aa00708d94a0cd78bfde34b68c0f95d8b50 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 10:51:21 -0600
Subject: KVM: arm64: pmu: Use generated define for PMSELR_EL0.SEL access

ARMV8_PMU_COUNTER_MASK is really a mask for the PMSELR_EL0.SEL register
field. Make that clear by adding a standard sysreg definition for the
register, and using it instead.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-4-280a8d7ff465@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index eccbdd8eb98f..792b8e10b72a 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -8,7 +8,6 @@
 
 #define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
 #define ARMV8_PMU_MAX_COUNTERS	32
-#define ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
 
 /*
  * Common architectural and microarchitectural event numbers.
-- 
cgit v1.2.3


From 126d7d7cce5e048fb82477a9842d088d10ff0df6 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 10:51:22 -0600
Subject: arm64: perf/kvm: Use a common PMU cycle counter define

The PMUv3 and KVM code each have a define for the PMU cycle counter
index. Move KVM's define to a shared location and use it for PMUv3
driver.

Reviewed-by: Marc Zyngier <maz@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-5-280a8d7ff465@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 792b8e10b72a..f4ec76f725a3 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -9,6 +9,9 @@
 #define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
 #define ARMV8_PMU_MAX_COUNTERS	32
 
+#define ARMV8_PMU_CYCLE_IDX		31
+
+
 /*
  * Common architectural and microarchitectural event numbers.
  */
-- 
cgit v1.2.3


From 2f62701fa5b0ee94c68d2fcfc470d08aef195441 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 10:51:23 -0600
Subject: KVM: arm64: Refine PMU defines for number of counters

There are 2 defines for the number of PMU counters:
ARMV8_PMU_MAX_COUNTERS and ARMPMU_MAX_HWEVENTS. Both are the same
currently, but Armv9.4/8.9 increases the number of possible counters
from 32 to 33. With this change, the maximum number of counters will
differ for KVM's PMU emulation which is PMUv3.4. Give KVM PMU emulation
its own define to decouple it from the rest of the kernel's number PMU
counters.

The VHE PMU code needs to match the PMU driver, so switch it to use
ARMPMU_MAX_HWEVENTS instead.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-6-280a8d7ff465@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmuv3.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index f4ec76f725a3..4f7a7f2222e5 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -7,8 +7,6 @@
 #define __PERF_ARM_PMUV3_H
 
 #define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
-#define ARMV8_PMU_MAX_COUNTERS	32
-
 #define ARMV8_PMU_CYCLE_IDX		31
 
 
-- 
cgit v1.2.3


From d8226d8cfbaf5eb9771af8ad8b4e58697e2ffb74 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 10:51:24 -0600
Subject: perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter

Armv9.4/8.9 PMU adds optional support for a fixed instruction counter
similar to the fixed cycle counter. Support for the feature is indicated
in the ID_AA64DFR1_EL1 register PMICNTR field. The counter is not
accessible in AArch32.

Existing userspace using direct counter access won't know how to handle
the fixed instruction counter, so we have to avoid using the counter
when user access is requested.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-7-280a8d7ff465@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf/arm_pmu.h   | 8 ++++++--
 include/linux/perf/arm_pmuv3.h | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index e5d6d204beab..4b5b83677e3f 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -17,10 +17,14 @@
 #ifdef CONFIG_ARM_PMU
 
 /*
- * The ARMv7 CPU PMU supports up to 32 event counters.
+ * The Armv7 and Armv8.8 or less CPU PMU supports up to 32 event counters.
+ * The Armv8.9/9.4 CPU PMU supports up to 33 event counters.
  */
+#ifdef CONFIG_ARM
 #define ARMPMU_MAX_HWEVENTS		32
-
+#else
+#define ARMPMU_MAX_HWEVENTS		33
+#endif
 /*
  * ARM PMU hw_event flags
  */
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 4f7a7f2222e5..3372c1b56486 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -8,7 +8,7 @@
 
 #define ARMV8_PMU_MAX_GENERAL_COUNTERS	31
 #define ARMV8_PMU_CYCLE_IDX		31
-
+#define ARMV8_PMU_INSTR_IDX		32 /* Not accessible from AArch32 */
 
 /*
  * Common architectural and microarchitectural event numbers.
@@ -228,8 +228,10 @@
  */
 #define ARMV8_PMU_OVSR_P		GENMASK(30, 0)
 #define ARMV8_PMU_OVSR_C		BIT(31)
+#define ARMV8_PMU_OVSR_F		BIT_ULL(32) /* arm64 only */
 /* Mask for writable bits is both P and C fields */
-#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C)
+#define ARMV8_PMU_OVERFLOWED_MASK	(ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C | \
+					ARMV8_PMU_OVSR_F)
 
 /*
  * PMXEVTYPER: Event selection reg
-- 
cgit v1.2.3


From fca5cb4dd2b4a9423cb6d112cc71c33899955a1f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 10 Aug 2024 14:20:55 +0800
Subject: Revert "lib/mpi: Extend the MPI library"

This partially reverts commit a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e.

Most of it is no longer needed since sm2 has been removed.  However,
the following functions have been kept as they have developed other
uses:

mpi_copy

mpi_mod

mpi_test_bit
mpi_set_bit
mpi_rshift

mpi_add
mpi_sub
mpi_addm
mpi_subm

mpi_mul
mpi_mulm

mpi_tdiv_r
mpi_fdiv_r

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/mpi.h | 65 -----------------------------------------------------
 1 file changed, 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index 89b720893e12..e081428b91ef 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -40,87 +40,33 @@ struct gcry_mpi {
 typedef struct gcry_mpi *MPI;
 
 #define mpi_get_nlimbs(a)     ((a)->nlimbs)
-#define mpi_has_sign(a)       ((a)->sign)
 
 /*-- mpiutil.c --*/
 MPI mpi_alloc(unsigned nlimbs);
-void mpi_clear(MPI a);
 void mpi_free(MPI a);
 int mpi_resize(MPI a, unsigned nlimbs);
 
-static inline MPI mpi_new(unsigned int nbits)
-{
-	return mpi_alloc((nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB);
-}
-
 MPI mpi_copy(MPI a);
-MPI mpi_alloc_like(MPI a);
-void mpi_snatch(MPI w, MPI u);
-MPI mpi_set(MPI w, MPI u);
-MPI mpi_set_ui(MPI w, unsigned long u);
-MPI mpi_alloc_set_ui(unsigned long u);
-void mpi_swap_cond(MPI a, MPI b, unsigned long swap);
-
-/* Constants used to return constant MPIs.  See mpi_init if you
- * want to add more constants.
- */
-#define MPI_NUMBER_OF_CONSTANTS 6
-enum gcry_mpi_constants {
-	MPI_C_ZERO,
-	MPI_C_ONE,
-	MPI_C_TWO,
-	MPI_C_THREE,
-	MPI_C_FOUR,
-	MPI_C_EIGHT
-};
-
-MPI mpi_const(enum gcry_mpi_constants no);
 
 /*-- mpicoder.c --*/
-
-/* Different formats of external big integer representation. */
-enum gcry_mpi_format {
-	GCRYMPI_FMT_NONE = 0,
-	GCRYMPI_FMT_STD = 1,    /* Twos complement stored without length. */
-	GCRYMPI_FMT_PGP = 2,    /* As used by OpenPGP (unsigned only). */
-	GCRYMPI_FMT_SSH = 3,    /* As used by SSH (like STD but with length). */
-	GCRYMPI_FMT_HEX = 4,    /* Hex format. */
-	GCRYMPI_FMT_USG = 5,    /* Like STD but unsigned. */
-	GCRYMPI_FMT_OPAQUE = 8  /* Opaque format (some functions only). */
-};
-
 MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes);
 MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread);
-int mpi_fromstr(MPI val, const char *str);
-MPI mpi_scanval(const char *string);
 MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len);
 void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign);
 int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
 		    int *sign);
 int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes,
 		     int *sign);
-int mpi_print(enum gcry_mpi_format format, unsigned char *buffer,
-			size_t buflen, size_t *nwritten, MPI a);
 
 /*-- mpi-mod.c --*/
 void mpi_mod(MPI rem, MPI dividend, MPI divisor);
 
-/* Context used with Barrett reduction.  */
-struct barrett_ctx_s;
-typedef struct barrett_ctx_s *mpi_barrett_t;
-
-mpi_barrett_t mpi_barrett_init(MPI m, int copy);
-void mpi_barrett_free(mpi_barrett_t ctx);
-void mpi_mod_barrett(MPI r, MPI x, mpi_barrett_t ctx);
-void mpi_mul_barrett(MPI w, MPI u, MPI v, mpi_barrett_t ctx);
-
 /*-- mpi-pow.c --*/
 int mpi_powm(MPI res, MPI base, MPI exp, MPI mod);
 
 /*-- mpi-cmp.c --*/
 int mpi_cmp_ui(MPI u, ulong v);
 int mpi_cmp(MPI u, MPI v);
-int mpi_cmpabs(MPI u, MPI v);
 
 /*-- mpi-sub-ui.c --*/
 int mpi_sub_ui(MPI w, MPI u, unsigned long vval);
@@ -130,16 +76,9 @@ void mpi_normalize(MPI a);
 unsigned mpi_get_nbits(MPI a);
 int mpi_test_bit(MPI a, unsigned int n);
 void mpi_set_bit(MPI a, unsigned int n);
-void mpi_set_highbit(MPI a, unsigned int n);
-void mpi_clear_highbit(MPI a, unsigned int n);
-void mpi_clear_bit(MPI a, unsigned int n);
-void mpi_rshift_limbs(MPI a, unsigned int count);
 void mpi_rshift(MPI x, MPI a, unsigned int n);
-void mpi_lshift_limbs(MPI a, unsigned int count);
-void mpi_lshift(MPI x, MPI a, unsigned int n);
 
 /*-- mpi-add.c --*/
-void mpi_add_ui(MPI w, MPI u, unsigned long v);
 void mpi_add(MPI w, MPI u, MPI v);
 void mpi_sub(MPI w, MPI u, MPI v);
 void mpi_addm(MPI w, MPI u, MPI v, MPI m);
@@ -152,10 +91,6 @@ void mpi_mulm(MPI w, MPI u, MPI v, MPI m);
 /*-- mpi-div.c --*/
 void mpi_tdiv_r(MPI rem, MPI num, MPI den);
 void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor);
-void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor);
-
-/*-- mpi-inv.c --*/
-int mpi_invm(MPI x, MPI a, MPI n);
 
 /* inline functions */
 
-- 
cgit v1.2.3


From 8e3a67f2de87ee94ac11ea69beb4edc2993b17a0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 10 Aug 2024 14:20:57 +0800
Subject: crypto: lib/mpi - Add error checks to extension

The remaining functions added by commit
a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e did not check for memory
allocation errors.  Add the checks and change the API to allow errors
to be returned.

Fixes: a8ea8bdd9df9 ("lib/mpi: Extend the MPI library")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/mpi.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index e081428b91ef..47be46f36435 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -59,7 +59,7 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes,
 		     int *sign);
 
 /*-- mpi-mod.c --*/
-void mpi_mod(MPI rem, MPI dividend, MPI divisor);
+int mpi_mod(MPI rem, MPI dividend, MPI divisor);
 
 /*-- mpi-pow.c --*/
 int mpi_powm(MPI res, MPI base, MPI exp, MPI mod);
@@ -75,22 +75,22 @@ int mpi_sub_ui(MPI w, MPI u, unsigned long vval);
 void mpi_normalize(MPI a);
 unsigned mpi_get_nbits(MPI a);
 int mpi_test_bit(MPI a, unsigned int n);
-void mpi_set_bit(MPI a, unsigned int n);
-void mpi_rshift(MPI x, MPI a, unsigned int n);
+int mpi_set_bit(MPI a, unsigned int n);
+int mpi_rshift(MPI x, MPI a, unsigned int n);
 
 /*-- mpi-add.c --*/
-void mpi_add(MPI w, MPI u, MPI v);
-void mpi_sub(MPI w, MPI u, MPI v);
-void mpi_addm(MPI w, MPI u, MPI v, MPI m);
-void mpi_subm(MPI w, MPI u, MPI v, MPI m);
+int mpi_add(MPI w, MPI u, MPI v);
+int mpi_sub(MPI w, MPI u, MPI v);
+int mpi_addm(MPI w, MPI u, MPI v, MPI m);
+int mpi_subm(MPI w, MPI u, MPI v, MPI m);
 
 /*-- mpi-mul.c --*/
-void mpi_mul(MPI w, MPI u, MPI v);
-void mpi_mulm(MPI w, MPI u, MPI v, MPI m);
+int mpi_mul(MPI w, MPI u, MPI v);
+int mpi_mulm(MPI w, MPI u, MPI v, MPI m);
 
 /*-- mpi-div.c --*/
-void mpi_tdiv_r(MPI rem, MPI num, MPI den);
-void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor);
+int mpi_tdiv_r(MPI rem, MPI num, MPI den);
+int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor);
 
 /* inline functions */
 
-- 
cgit v1.2.3


From abc158c82ae555078aa5dd2d8407c3df0f868904 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 May 2024 10:55:59 +0200
Subject: sched: Prepare generic code for delayed dequeue

While most of the delayed dequeue code can be done inside the
sched_class itself, there is one location where we do not have an
appropriate hook, namely ttwu_runnable().

Add an ENQUEUE_DELAYED call to the on_rq path to deal with waking
delayed dequeue tasks.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.200000445@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c1b4ee3234f..f4a648e739e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -544,6 +544,7 @@ struct sched_entity {
 
 	struct list_head		group_node;
 	unsigned int			on_rq;
+	unsigned int			sched_delayed;
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-- 
cgit v1.2.3


From a1c446611e31ca5363d4db51e398271da1dce0af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 Jul 2024 21:30:09 +0200
Subject: sched,freezer: Mark TASK_FROZEN special

The special task states are those that do not suffer spurious wakeups,
TASK_FROZEN is very much one of those, mark it as such.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105029.998329901@infradead.org
---
 include/linux/sched.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4a648e739e1..8a3a389bd623 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -149,8 +149,9 @@ struct user_event_mm;
  * Special states are those that do not use the normal wait-loop pattern. See
  * the comment with set_special_state().
  */
-#define is_special_task_state(state)				\
-	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
+#define is_special_task_state(state)					\
+	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |	\
+		    TASK_DEAD | TASK_FROZEN))
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 # define debug_normal_state_change(state_value)				\
-- 
cgit v1.2.3


From 82e9d0456e06cebe2c89f3c73cdbc9e3805e9437 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 May 2024 15:49:40 +0200
Subject: sched/fair: Avoid re-setting virtual deadline on 'migrations'

During OSPM24 Youssef noted that migrations are re-setting the virtual
deadline. Notably everything that does a dequeue-enqueue, like setting
nice, changing preferred numa-node, and a myriad of other random crap,
will cause this to happen.

This shouldn't be. Preserve the relative virtual deadline across such
dequeue/enqueue cycles.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.625119246@infradead.org
---
 include/linux/sched.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a3a389bd623..d25e1cfd5766 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -544,8 +544,10 @@ struct sched_entity {
 	u64				min_vruntime;
 
 	struct list_head		group_node;
-	unsigned int			on_rq;
-	unsigned int			sched_delayed;
+	unsigned char			on_rq;
+	unsigned char			sched_delayed;
+	unsigned char			rel_deadline;
+					/* hole */
 
 	u64				exec_start;
 	u64				sum_exec_runtime;
-- 
cgit v1.2.3


From 857b158dc5e81c6de795ef6be006eed146098fc6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 May 2023 13:46:30 +0200
Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice
 suggestion

Allow applications to directly set a suggested request/slice length using
sched_attr::sched_runtime.

The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms]
which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100.

Applications should strive to use their periodic runtime at a high
confidence interval (95%+) as the target slice. Using a smaller slice
will introduce undue preemptions, while using a larger value will
increase latency.

For all the following examples assume a scheduling quantum of 8, and for
consistency all examples have W=4:

  {A,B,C,D}(w=1,r=8):

  ABCD...
  +---+---+---+---

  t=0, V=1.5				t=1, V=3.5
  A  |------<				A          |------<
  B   |------<				B   |------<
  C    |------<				C    |------<
  D     |------<			D     |------<
  ---+*------+-------+---		---+--*----+-------+---

  t=2, V=5.5				t=3, V=7.5
  A          |------<			A          |------<
  B           |------<			B           |------<
  C    |------<				C            |------<
  D     |------<			D     |------<
  ---+----*--+-------+---		---+------*+-------+---

Note: 4 identical tasks in FIFO order

~~~

  {A,B}(w=1,r=16) C(w=2,r=16)

  AACCBBCC...
  +---+---+---+---

  t=0, V=1.25				t=2, V=5.25
  A  |--------------<                   A                  |--------------<
  B   |--------------<                  B   |--------------<
  C    |------<                         C    |------<
  ---+*------+-------+---               ---+----*--+-------+---

  t=4, V=8.25				t=6, V=12.25
  A                  |--------------<   A                  |--------------<
  B   |--------------<                  B                   |--------------<
  C            |------<                 C            |------<
  ---+-------*-------+---               ---+-------+---*---+---

Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2
      task doesn't go below q.

Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length.

Note: the period of the heavy task is half the full period at:
      W*(r_i/w_i) = 4*(2q/2) = 4q

~~~

  {A,C,D}(w=1,r=16) B(w=1,r=8):

  BAACCBDD...
  +---+---+---+---

  t=0, V=1.5				t=1, V=3.5
  A  |--------------<			A  |---------------<
  B   |------<				B           |------<
  C    |--------------<			C    |--------------<
  D     |--------------<		D     |--------------<
  ---+*------+-------+---		---+--*----+-------+---

  t=3, V=7.5				t=5, V=11.5
  A                  |---------------<  A                  |---------------<
  B           |------<                  B           |------<
  C    |--------------<                 C                    |--------------<
  D     |--------------<                D     |--------------<
  ---+------*+-------+---               ---+-------+--*----+---

  t=6, V=13.5
  A                  |---------------<
  B                   |------<
  C                    |--------------<
  D     |--------------<
  ---+-------+----*--+---

Note: 1 short task -- again double r so that the deadline of the short task
      won't be below q. Made B short because its not the leftmost task, but is
      eligible with the 0,1,2,3 spread.

Note: like with the heavy task, the period of the short task observes:
      W*(r_i/w_i) = 4*(1q/1) = 4q

~~~

  A(w=1,r=16) B(w=1,r=8) C(w=2,r=16)

  BCCAABCC...
  +---+---+---+---

  t=0, V=1.25				t=1, V=3.25
  A  |--------------<                   A  |--------------<
  B   |------<                          B           |------<
  C    |------<                         C    |------<
  ---+*------+-------+---               ---+--*----+-------+---

  t=3, V=7.25				t=5, V=11.25
  A  |--------------<                   A                  |--------------<
  B           |------<                  B           |------<
  C            |------<                 C            |------<
  ---+------*+-------+---               ---+-------+--*----+---

  t=6, V=13.25
  A                  |--------------<
  B                   |------<
  C            |------<
  ---+-------+----*--+---

Note: 1 heavy and 1 short task -- combine them all.

Note: both the short and heavy task end up with a period of 4q

~~~

  A(w=1,r=16) B(w=2,r=16) C(w=1,r=8)

  BBCAABBC...
  +---+---+---+---

  t=0, V=1				t=2, V=5
  A  |--------------<                   A  |--------------<
  B   |------<                          B           |------<
  C    |------<                         C    |------<
  ---+*------+-------+---               ---+----*--+-------+---

  t=3, V=7				t=5, V=11
  A  |--------------<                   A                  |--------------<
  B           |------<                  B           |------<
  C            |------<                 C            |------<
  ---+------*+-------+---               ---+-------+--*----+---

  t=7, V=15
  A                  |--------------<
  B                   |------<
  C            |------<
  ---+-------+------*+---

Note: as before but permuted

~~~

From all this it can be deduced that, for the steady state:

 - the total period (P) of a schedule is:	W*max(r_i/w_i)
 - the average period of a task is:		W*(r_i/w_i)
 - each task obtains the fair share:		w_i/W of each full period P

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.842834421@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d25e1cfd5766..89a3d8d94e96 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -547,6 +547,7 @@ struct sched_entity {
 	unsigned char			on_rq;
 	unsigned char			sched_delayed;
 	unsigned char			rel_deadline;
+	unsigned char			custom_slice;
 					/* hole */
 
 	u64				exec_start;
-- 
cgit v1.2.3


From aef6987d89544d63a47753cf3741cabff0b5574c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 20 Jun 2024 13:16:49 +0200
Subject: sched/eevdf: Propagate min_slice up the cgroup hierarchy

In the absence of an explicit cgroup slice configureation, make mixed
slice length work with cgroups by propagating the min_slice up the
hierarchy.

This ensures the cgroup entity gets timely service to service its
entities that have this timing constraint set on them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/20240727105030.948188417@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89a3d8d94e96..3709dedbab59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,6 +542,7 @@ struct sched_entity {
 	struct rb_node			run_node;
 	u64				deadline;
 	u64				min_vruntime;
+	u64				min_slice;
 
 	struct list_head		group_node;
 	unsigned char			on_rq;
-- 
cgit v1.2.3


From fda1dd3c54ef3c4200c2e77634a91610da973950 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 18 Jul 2024 17:50:37 -0700
Subject: find: Switch from inline to __always_inline

'inline' keyword is only a recommendation for compiler. If it decides to
not inline find_bit nodemask functions, the whole small_const_nbits()
machinery doesn't work.

This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch
replaces 'inline' directive with unconditional '__always_inline' to make
sure that there's always a chance for compile-time optimization. It doesn't
change size of kernel image, according to bloat-o-meter.

[[ Brian: split out from:
      Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline
      https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/
   But rewritten, as there were too many conflicts. ]]

Co-developed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/find.h | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/find.h b/include/linux/find.h
index 5dfca4225fef..68685714bc18 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -52,7 +52,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 			    unsigned long offset)
 {
@@ -81,7 +81,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_and_bit(const unsigned long *addr1,
 		const unsigned long *addr2, unsigned long size,
 		unsigned long offset)
@@ -112,7 +112,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_andnot_bit(const unsigned long *addr1,
 		const unsigned long *addr2, unsigned long size,
 		unsigned long offset)
@@ -142,7 +142,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1,
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_or_bit(const unsigned long *addr1,
 		const unsigned long *addr2, unsigned long size,
 		unsigned long offset)
@@ -171,7 +171,7 @@ unsigned long find_next_or_bit(const unsigned long *addr1,
  * Returns the bit number of the next zero bit
  * If no bits are zero, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
 {
@@ -198,7 +198,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 {
 	if (small_const_nbits(size)) {
@@ -224,7 +224,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
  * Returns the bit number of the N'th set bit.
  * If no such, returns >= @size.
  */
-static inline
+static __always_inline
 unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
 {
 	if (n >= size)
@@ -249,7 +249,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign
  * Returns the bit number of the N'th set bit.
  * If no such, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
 				unsigned long size, unsigned long n)
 {
@@ -276,7 +276,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *
  * Returns the bit number of the N'th set bit.
  * If no such, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
 				unsigned long size, unsigned long n)
 {
@@ -332,7 +332,7 @@ unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
  * Returns the bit number for the next set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_first_and_bit(const unsigned long *addr1,
 				 const unsigned long *addr2,
 				 unsigned long size)
@@ -357,7 +357,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1,
  * Returns the bit number for the first set bit
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_first_and_and_bit(const unsigned long *addr1,
 				     const unsigned long *addr2,
 				     const unsigned long *addr3,
@@ -381,7 +381,7 @@ unsigned long find_first_and_and_bit(const unsigned long *addr1,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
 	if (small_const_nbits(size)) {
@@ -402,7 +402,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
  *
  * Returns the bit number of the last set bit, or size.
  */
-static inline
+static __always_inline
 unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
 {
 	if (small_const_nbits(size)) {
@@ -425,7 +425,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
  * Returns the bit number for the next set bit, or first set bit up to @offset
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
 					const unsigned long *addr2,
 					unsigned long size, unsigned long offset)
@@ -448,7 +448,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
  * Returns the bit number for the next set bit, or first set bit up to @offset
  * If no bits are set, returns @size.
  */
-static inline
+static __always_inline
 unsigned long find_next_bit_wrap(const unsigned long *addr,
 					unsigned long size, unsigned long offset)
 {
@@ -465,7 +465,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr,
  * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing
  * before using it alone.
  */
-static inline
+static __always_inline
 unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
 				 unsigned long start, unsigned long n)
 {
@@ -506,20 +506,20 @@ extern unsigned long find_next_clump8(unsigned long *clump,
 
 #if defined(__LITTLE_ENDIAN)
 
-static inline unsigned long find_next_zero_bit_le(const void *addr,
-		unsigned long size, unsigned long offset)
+static __always_inline
+unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset)
 {
 	return find_next_zero_bit(addr, size, offset);
 }
 
-static inline unsigned long find_next_bit_le(const void *addr,
-		unsigned long size, unsigned long offset)
+static __always_inline
+unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset)
 {
 	return find_next_bit(addr, size, offset);
 }
 
-static inline unsigned long find_first_zero_bit_le(const void *addr,
-		unsigned long size)
+static __always_inline
+unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
 {
 	return find_first_zero_bit(addr, size);
 }
@@ -527,7 +527,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr,
 #elif defined(__BIG_ENDIAN)
 
 #ifndef find_next_zero_bit_le
-static inline
+static __always_inline
 unsigned long find_next_zero_bit_le(const void *addr, unsigned
 		long size, unsigned long offset)
 {
@@ -546,7 +546,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned
 #endif
 
 #ifndef find_first_zero_bit_le
-static inline
+static __always_inline
 unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
 {
 	if (small_const_nbits(size)) {
@@ -560,7 +560,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
 #endif
 
 #ifndef find_next_bit_le
-static inline
+static __always_inline
 unsigned long find_next_bit_le(const void *addr, unsigned
 		long size, unsigned long offset)
 {
-- 
cgit v1.2.3


From ed8cd2b3bd9f070120528fc5403a2d0b5afe07f8 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 18 Jul 2024 17:50:38 -0700
Subject: bitmap: Switch from inline to __always_inline

'inline' keyword is only a recommendation for compiler. If it decides to
not inline bitmap functions, the whole small_const_nbits() machinery
doesn't work.

This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch
replaces 'inline' directive with unconditional '__always_inline' to make
sure that there's always a chance for compile-time optimization. It doesn't
change size of kernel image, according to bloat-o-meter.

[[ Brian: split out from:
      Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline
      https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/
   But rewritten, as there were too many conflicts. ]]

Co-developed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 140 +++++++++++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 8c4768c44a01..0406f48f0a6a 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -203,12 +203,12 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
  * the bit offset of all zero areas this function finds is multiples of that
  * power of 2. A @align_mask of 0 means no alignment is required.
  */
-static inline unsigned long
-bitmap_find_next_zero_area(unsigned long *map,
-			   unsigned long size,
-			   unsigned long start,
-			   unsigned int nr,
-			   unsigned long align_mask)
+static __always_inline
+unsigned long bitmap_find_next_zero_area(unsigned long *map,
+					 unsigned long size,
+					 unsigned long start,
+					 unsigned int nr,
+					 unsigned long align_mask)
 {
 	return bitmap_find_next_zero_area_off(map, size, start, nr,
 					      align_mask, 0);
@@ -228,7 +228,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 
 #define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
 
-static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
+static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = bitmap_size(nbits);
 
@@ -238,7 +238,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 		memset(dst, 0, len);
 }
 
-static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
+static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = bitmap_size(nbits);
 
@@ -248,8 +248,8 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 		memset(dst, 0xff, len);
 }
 
-static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
-			unsigned int nbits)
+static __always_inline
+void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits)
 {
 	unsigned int len = bitmap_size(nbits);
 
@@ -262,8 +262,8 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 /*
  * Copy bitmap and clear tail bits in last word.
  */
-static inline void bitmap_copy_clear_tail(unsigned long *dst,
-		const unsigned long *src, unsigned int nbits)
+static __always_inline
+void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits)
 {
 	bitmap_copy(dst, src, nbits);
 	if (nbits % BITS_PER_LONG)
@@ -306,16 +306,18 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits);
 	bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits))
 #endif
 
-static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
-			const unsigned long *src2, unsigned int nbits)
+static __always_inline
+bool bitmap_and(unsigned long *dst, const unsigned long *src1,
+		const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
 	return __bitmap_and(dst, src1, src2, nbits);
 }
 
-static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
-			const unsigned long *src2, unsigned int nbits)
+static __always_inline
+void bitmap_or(unsigned long *dst, const unsigned long *src1,
+	       const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = *src1 | *src2;
@@ -323,8 +325,9 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
 		__bitmap_or(dst, src1, src2, nbits);
 }
 
-static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
-			const unsigned long *src2, unsigned int nbits)
+static __always_inline
+void bitmap_xor(unsigned long *dst, const unsigned long *src1,
+		const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = *src1 ^ *src2;
@@ -332,16 +335,17 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 		__bitmap_xor(dst, src1, src2, nbits);
 }
 
-static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
-			const unsigned long *src2, unsigned int nbits)
+static __always_inline
+bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+		   const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
 	return __bitmap_andnot(dst, src1, src2, nbits);
 }
 
-static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
-			unsigned int nbits)
+static __always_inline
+void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = ~(*src);
@@ -356,8 +360,8 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr
 #endif
 #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)
 
-static inline bool bitmap_equal(const unsigned long *src1,
-				const unsigned long *src2, unsigned int nbits)
+static __always_inline
+bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
@@ -376,10 +380,9 @@ static inline bool bitmap_equal(const unsigned long *src1,
  *
  * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
  */
-static inline bool bitmap_or_equal(const unsigned long *src1,
-				   const unsigned long *src2,
-				   const unsigned long *src3,
-				   unsigned int nbits)
+static __always_inline
+bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2,
+		     const unsigned long *src3, unsigned int nbits)
 {
 	if (!small_const_nbits(nbits))
 		return __bitmap_or_equal(src1, src2, src3, nbits);
@@ -387,9 +390,8 @@ static inline bool bitmap_or_equal(const unsigned long *src1,
 	return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits));
 }
 
-static inline bool bitmap_intersects(const unsigned long *src1,
-				     const unsigned long *src2,
-				     unsigned int nbits)
+static __always_inline
+bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
@@ -397,8 +399,8 @@ static inline bool bitmap_intersects(const unsigned long *src1,
 		return __bitmap_intersects(src1, src2, nbits);
 }
 
-static inline bool bitmap_subset(const unsigned long *src1,
-				 const unsigned long *src2, unsigned int nbits)
+static __always_inline
+bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
@@ -406,7 +408,8 @@ static inline bool bitmap_subset(const unsigned long *src1,
 		return __bitmap_subset(src1, src2, nbits);
 }
 
-static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
+static __always_inline
+bool bitmap_empty(const unsigned long *src, unsigned nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
@@ -414,7 +417,8 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
 	return find_first_bit(src, nbits) == nbits;
 }
 
-static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
+static __always_inline
+bool bitmap_full(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
@@ -448,8 +452,8 @@ unsigned long bitmap_weight_andnot(const unsigned long *src1,
 	return __bitmap_weight_andnot(src1, src2, nbits);
 }
 
-static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
-		unsigned int nbits)
+static __always_inline
+void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__set_bit(start, map);
@@ -464,8 +468,8 @@ static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
 		__bitmap_set(map, start, nbits);
 }
 
-static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
-		unsigned int nbits)
+static __always_inline
+void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits)
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__clear_bit(start, map);
@@ -480,8 +484,9 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
 		__bitmap_clear(map, start, nbits);
 }
 
-static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
-				unsigned int shift, unsigned int nbits)
+static __always_inline
+void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+			unsigned int shift, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
@@ -489,8 +494,9 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s
 		__bitmap_shift_right(dst, src, shift, nbits);
 }
 
-static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
-				unsigned int shift, unsigned int nbits)
+static __always_inline
+void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+		       unsigned int shift, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
@@ -498,11 +504,12 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr
 		__bitmap_shift_left(dst, src, shift, nbits);
 }
 
-static inline void bitmap_replace(unsigned long *dst,
-				  const unsigned long *old,
-				  const unsigned long *new,
-				  const unsigned long *mask,
-				  unsigned int nbits)
+static __always_inline
+void bitmap_replace(unsigned long *dst,
+		    const unsigned long *old,
+		    const unsigned long *new,
+		    const unsigned long *mask,
+		    unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = (*old & ~(*mask)) | (*new & *mask);
@@ -545,8 +552,9 @@ static inline void bitmap_replace(unsigned long *dst,
  * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
  * See bitmap_scatter() for details related to this relationship.
  */
-static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src,
-				  const unsigned long *mask, unsigned int nbits)
+static __always_inline
+void bitmap_scatter(unsigned long *dst, const unsigned long *src,
+		    const unsigned long *mask, unsigned int nbits)
 {
 	unsigned int n = 0;
 	unsigned int bit;
@@ -599,8 +607,9 @@ static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src,
  * bitmap_scatter(res, src, mask, n) and a call to
  * bitmap_scatter(res, result, mask, n) will lead to the same res value.
  */
-static inline void bitmap_gather(unsigned long *dst, const unsigned long *src,
-				 const unsigned long *mask, unsigned int nbits)
+static __always_inline
+void bitmap_gather(unsigned long *dst, const unsigned long *src,
+		   const unsigned long *mask, unsigned int nbits)
 {
 	unsigned int n = 0;
 	unsigned int bit;
@@ -611,9 +620,9 @@ static inline void bitmap_gather(unsigned long *dst, const unsigned long *src,
 		__assign_bit(n++, dst, test_bit(bit, src));
 }
 
-static inline void bitmap_next_set_region(unsigned long *bitmap,
-					  unsigned int *rs, unsigned int *re,
-					  unsigned int end)
+static __always_inline
+void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs,
+			    unsigned int *re, unsigned int end)
 {
 	*rs = find_next_bit(bitmap, end, *rs);
 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
@@ -628,7 +637,8 @@ static inline void bitmap_next_set_region(unsigned long *bitmap,
  * This is the complement to __bitmap_find_free_region() and releases
  * the found region (by clearing it in the bitmap).
  */
-static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
+static __always_inline
+void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
 {
 	bitmap_clear(bitmap, pos, BIT(order));
 }
@@ -644,7 +654,8 @@ static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos
  * Returns: 0 on success, or %-EBUSY if specified region wasn't
  * free (not all bits were zero).
  */
-static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
+static __always_inline
+int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
 {
 	unsigned int len = BIT(order);
 
@@ -668,7 +679,8 @@ static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos
  * Returns: the bit offset in bitmap of the allocated region,
  * or -errno on failure.
  */
-static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
+static __always_inline
+int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
 {
 	unsigned int pos, end;		/* scans bitmap by regions of size order */
 
@@ -722,7 +734,7 @@ static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bi
  * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits,
  * but we expect the lower 32-bits of u64.
  */
-static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
+static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask)
 {
 	bitmap_from_arr64(dst, &mask, 64);
 }
@@ -737,9 +749,8 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
  * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return
  * value is undefined.
  */
-static inline unsigned long bitmap_read(const unsigned long *map,
-					unsigned long start,
-					unsigned long nbits)
+static __always_inline
+unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits)
 {
 	size_t index = BIT_WORD(start);
 	unsigned long offset = start % BITS_PER_LONG;
@@ -772,8 +783,9 @@ static inline unsigned long bitmap_read(const unsigned long *map,
  *
  * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed.
  */
-static inline void bitmap_write(unsigned long *map, unsigned long value,
-				unsigned long start, unsigned long nbits)
+static __always_inline
+void bitmap_write(unsigned long *map, unsigned long value,
+		  unsigned long start, unsigned long nbits)
 {
 	size_t index;
 	unsigned long offset;
-- 
cgit v1.2.3


From ab6b1010dab68f6d4bf063517db4ce2d63554bc6 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Thu, 18 Jul 2024 17:50:39 -0700
Subject: cpumask: Switch from inline to __always_inline

On recent (v6.6+) builds with Clang (based on Clang 18.0.0) and certain
configurations [0], I'm finding that (lack of) inlining decisions may
lead to section mismatch warnings like the following:

  WARNING: modpost: vmlinux.o: section mismatch in reference:
  cpumask_andnot (section: .text) ->
  cpuhp_bringup_cpus_parallel.tmp_mask (section: .init.data) ERROR:
  modpost: Section mismatches detected.

or more confusingly:

  WARNING: modpost: vmlinux: section mismatch in reference:
  cpumask_andnot+0x5f (section: .text) -> efi_systab_phys (section:
  .init.data)

The first warning makes a little sense, because
cpuhp_bringup_cpus_parallel() (an __init function) calls
cpumask_andnot() on tmp_mask (an __initdata symbol). If the compiler
doesn't inline cpumask_andnot(), this may appear like a mismatch.

The second warning makes less sense, but might be because efi_systab_phys
and cpuhp_bringup_cpus_parallel.tmp_mask are laid out near each other,
and the latter isn't a proper C symbol definition.

In any case, it seems a reasonable solution to suggest more strongly to
the compiler that these cpumask macros *must* be inlined, as 'inline' is
just a recommendation.

This change has been previously proposed in the past as:

  Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline
  https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/

But the change has been split up, to separately justify the cpumask
changes (which drive my work) and the bitmap/const optimizations (that
Yury separately proposed for other reasons). This ends up as somewhere
between a "rebase" and "rewrite" -- I had to rewrite most of the patch.

According to bloat-o-meter, vmlinux decreases minimally in size (-0.00%
to -0.01%, depending on the version of GCC or Clang and .config in
question) with this series of changes:

gcc 13.2.0, x86_64_defconfig
-3005 bytes, Before=21944501, After=21941496, chg -0.01%

clang 16.0.6, x86_64_defconfig
-105 bytes, Before=22571692, After=22571587, chg -0.00%

gcc 9.5.0, x86_64_defconfig
-1771 bytes, Before=21557598, After=21555827, chg -0.01%

clang 18.0_pre516547 (ChromiumOS toolchain), x86_64_defconfig
-191 bytes, Before=22615339, After=22615148, chg -0.00%

clang 18.0_pre516547 (ChromiumOS toolchain), based on ChromiumOS config + gcov
-979 bytes, Before=76294783, After=76293804, chg -0.00%

[0] CONFIG_HOTPLUG_PARALLEL=y ('select'ed for x86 as of [1]) and
    CONFIG_GCOV_PROFILE_ALL.

[1] commit 0c7ffa32dbd6 ("x86/smpboot/64: Implement
    arch_cpuhp_init_parallel_bringup() and enable it")

Co-developed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 212 +++++++++++++++++++++++++-----------------------
 1 file changed, 112 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 801a7e524113..9e3f3c96607d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -30,7 +30,7 @@
 extern unsigned int nr_cpu_ids;
 #endif
 
-static inline void set_nr_cpu_ids(unsigned int nr)
+static __always_inline void set_nr_cpu_ids(unsigned int nr)
 {
 #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
 	WARN_ON(nr != nr_cpu_ids);
@@ -149,7 +149,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
  *
  * Return: >= nr_cpu_ids if no cpus set.
  */
-static inline unsigned int cpumask_first(const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
 	return find_first_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
@@ -160,7 +160,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
  *
  * Return: >= nr_cpu_ids if all cpus are set.
  */
-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
 	return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
@@ -172,7 +172,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
  *
  * Return: >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
  */
-static inline
+static __always_inline
 unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
 {
 	return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
@@ -186,7 +186,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
  *
  * Return: >= nr_cpu_ids if no cpus set in all.
  */
-static inline
+static __always_inline
 unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
 				   const struct cpumask *srcp2,
 				   const struct cpumask *srcp3)
@@ -201,7 +201,7 @@ unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
  *
  * Return:	>= nr_cpumask_bits if no CPUs set.
  */
-static inline unsigned int cpumask_last(const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
 	return find_last_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
@@ -213,7 +213,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
  *
  * Return: >= nr_cpu_ids if no further cpus set.
  */
-static inline
+static __always_inline
 unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
 	/* -1 is a legal arg here. */
@@ -229,7 +229,8 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
  *
  * Return: >= nr_cpu_ids if no further cpus unset.
  */
-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
+static __always_inline
+unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
 	/* -1 is a legal arg here. */
 	if (n != -1)
@@ -239,18 +240,21 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 
 #if NR_CPUS == 1
 /* Uniprocessor: there is only one valid CPU */
-static inline unsigned int cpumask_local_spread(unsigned int i, int node)
+static __always_inline
+unsigned int cpumask_local_spread(unsigned int i, int node)
 {
 	return 0;
 }
 
-static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
-						      const struct cpumask *src2p)
+static __always_inline
+unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
+					const struct cpumask *src2p)
 {
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
+static __always_inline
+unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
@@ -269,9 +273,9 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp);
  *
  * Return: >= nr_cpu_ids if no further cpus set in both.
  */
-static inline
+static __always_inline
 unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
-		     const struct cpumask *src2p)
+			      const struct cpumask *src2p)
 {
 	/* -1 is a legal arg here. */
 	if (n != -1)
@@ -291,7 +295,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 	for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
 
 #if NR_CPUS == 1
-static inline
+static __always_inline
 unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
 {
 	cpumask_check(start);
@@ -394,7 +398,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * Often used to find any cpu but smp_processor_id() in a mask.
  * Return: >= nr_cpu_ids if no cpus set.
  */
-static inline
+static __always_inline
 unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 {
 	unsigned int i;
@@ -414,7 +418,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
  *
  * Returns >= nr_cpu_ids if no cpus set.
  */
-static inline
+static __always_inline
 unsigned int cpumask_any_and_but(const struct cpumask *mask1,
 				 const struct cpumask *mask2,
 				 unsigned int cpu)
@@ -436,7 +440,8 @@ unsigned int cpumask_any_and_but(const struct cpumask *mask1,
  *
  * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
-static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
+static __always_inline
+unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
 {
 	return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu));
 }
@@ -449,7 +454,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s
  *
  * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
-static inline
+static __always_inline
 unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
 							const struct cpumask *srcp2)
 {
@@ -465,7 +470,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
  *
  * Return: >= nr_cpu_ids if such cpu doesn't exist.
  */
-static inline
+static __always_inline
 unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
 							const struct cpumask *srcp2)
 {
@@ -508,12 +513,14 @@ unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp
  * @cpu: cpu number (< nr_cpu_ids)
  * @dstp: the cpumask pointer
  */
-static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+static __always_inline
+void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
 {
 	set_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
 
-static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+static __always_inline
+void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
 {
 	__set_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
@@ -557,7 +564,8 @@ static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp,
  *
  * Return: true if @cpu is set in @cpumask, else returns false
  */
-static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
+static __always_inline
+bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
 {
 	return test_bit(cpumask_check(cpu), cpumask_bits((cpumask)));
 }
@@ -571,7 +579,8 @@ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpum
  *
  * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
-static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline
+bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -585,7 +594,8 @@ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cp
  *
  * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
  */
-static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline
+bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -594,7 +604,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
  * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
  */
-static inline void cpumask_setall(struct cpumask *dstp)
+static __always_inline void cpumask_setall(struct cpumask *dstp)
 {
 	if (small_const_nbits(small_cpumask_bits)) {
 		cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits);
@@ -607,7 +617,7 @@ static inline void cpumask_setall(struct cpumask *dstp)
  * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
  */
-static inline void cpumask_clear(struct cpumask *dstp)
+static __always_inline void cpumask_clear(struct cpumask *dstp)
 {
 	bitmap_zero(cpumask_bits(dstp), large_cpumask_bits);
 }
@@ -620,9 +630,9 @@ static inline void cpumask_clear(struct cpumask *dstp)
  *
  * Return: false if *@dstp is empty, else returns true
  */
-static inline bool cpumask_and(struct cpumask *dstp,
-			       const struct cpumask *src1p,
-			       const struct cpumask *src2p)
+static __always_inline
+bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p,
+		 const struct cpumask *src2p)
 {
 	return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
 				       cpumask_bits(src2p), small_cpumask_bits);
@@ -634,8 +644,9 @@ static inline bool cpumask_and(struct cpumask *dstp,
  * @src1p: the first input
  * @src2p: the second input
  */
-static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
-			      const struct cpumask *src2p)
+static __always_inline
+void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
+		const struct cpumask *src2p)
 {
 	bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
 				      cpumask_bits(src2p), small_cpumask_bits);
@@ -647,9 +658,9 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  */
-static inline void cpumask_xor(struct cpumask *dstp,
-			       const struct cpumask *src1p,
-			       const struct cpumask *src2p)
+static __always_inline
+void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p,
+		 const struct cpumask *src2p)
 {
 	bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
 				       cpumask_bits(src2p), small_cpumask_bits);
@@ -663,9 +674,9 @@ static inline void cpumask_xor(struct cpumask *dstp,
  *
  * Return: false if *@dstp is empty, else returns true
  */
-static inline bool cpumask_andnot(struct cpumask *dstp,
-				  const struct cpumask *src1p,
-				  const struct cpumask *src2p)
+static __always_inline
+bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p,
+		    const struct cpumask *src2p)
 {
 	return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
 					  cpumask_bits(src2p), small_cpumask_bits);
@@ -678,8 +689,8 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
  *
  * Return: true if the cpumasks are equal, false if not
  */
-static inline bool cpumask_equal(const struct cpumask *src1p,
-				const struct cpumask *src2p)
+static __always_inline
+bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p)
 {
 	return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
 						 small_cpumask_bits);
@@ -694,9 +705,9 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
  * Return: true if first cpumask ORed with second cpumask == third cpumask,
  *	   otherwise false
  */
-static inline bool cpumask_or_equal(const struct cpumask *src1p,
-				    const struct cpumask *src2p,
-				    const struct cpumask *src3p)
+static __always_inline
+bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p,
+		      const struct cpumask *src3p)
 {
 	return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
 			       cpumask_bits(src3p), small_cpumask_bits);
@@ -710,8 +721,8 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
  * Return: true if first cpumask ANDed with second cpumask is non-empty,
  *	   otherwise false
  */
-static inline bool cpumask_intersects(const struct cpumask *src1p,
-				     const struct cpumask *src2p)
+static __always_inline
+bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p)
 {
 	return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
 						      small_cpumask_bits);
@@ -724,8 +735,8 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
  *
  * Return: true if *@src1p is a subset of *@src2p, else returns false
  */
-static inline bool cpumask_subset(const struct cpumask *src1p,
-				 const struct cpumask *src2p)
+static __always_inline
+bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p)
 {
 	return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
 						  small_cpumask_bits);
@@ -737,7 +748,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
  *
  * Return: true if srcp is empty (has no bits set), else false
  */
-static inline bool cpumask_empty(const struct cpumask *srcp)
+static __always_inline bool cpumask_empty(const struct cpumask *srcp)
 {
 	return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits);
 }
@@ -748,7 +759,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp)
  *
  * Return: true if srcp is full (has all bits set), else false
  */
-static inline bool cpumask_full(const struct cpumask *srcp)
+static __always_inline bool cpumask_full(const struct cpumask *srcp)
 {
 	return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits);
 }
@@ -759,7 +770,7 @@ static inline bool cpumask_full(const struct cpumask *srcp)
  *
  * Return: count of bits set in *srcp
  */
-static inline unsigned int cpumask_weight(const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp)
 {
 	return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits);
 }
@@ -771,8 +782,8 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
  *
  * Return: count of bits set in both *srcp1 and *srcp2
  */
-static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
-						const struct cpumask *srcp2)
+static __always_inline
+unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
 {
 	return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
@@ -784,8 +795,9 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
  *
  * Return: count of bits set in both *srcp1 and *srcp2
  */
-static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
-						const struct cpumask *srcp2)
+static __always_inline
+unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
+				   const struct cpumask *srcp2)
 {
 	return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
@@ -796,8 +808,8 @@ static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
  * @srcp: the input to shift
  * @n: the number of bits to shift by
  */
-static inline void cpumask_shift_right(struct cpumask *dstp,
-				       const struct cpumask *srcp, int n)
+static __always_inline
+void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n)
 {
 	bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
 					       small_cpumask_bits);
@@ -809,8 +821,8 @@ static inline void cpumask_shift_right(struct cpumask *dstp,
  * @srcp: the input to shift
  * @n: the number of bits to shift by
  */
-static inline void cpumask_shift_left(struct cpumask *dstp,
-				      const struct cpumask *srcp, int n)
+static __always_inline
+void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n)
 {
 	bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n,
 					      nr_cpumask_bits);
@@ -821,8 +833,8 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
  * @dstp: the result
  * @srcp: the input cpumask
  */
-static inline void cpumask_copy(struct cpumask *dstp,
-				const struct cpumask *srcp)
+static __always_inline
+void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp)
 {
 	bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits);
 }
@@ -858,8 +870,8 @@ static inline void cpumask_copy(struct cpumask *dstp,
  *
  * Return: -errno, or 0 for success.
  */
-static inline int cpumask_parse_user(const char __user *buf, int len,
-				     struct cpumask *dstp)
+static __always_inline
+int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp)
 {
 	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
 }
@@ -872,8 +884,8 @@ static inline int cpumask_parse_user(const char __user *buf, int len,
  *
  * Return: -errno, or 0 for success.
  */
-static inline int cpumask_parselist_user(const char __user *buf, int len,
-				     struct cpumask *dstp)
+static __always_inline
+int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp)
 {
 	return bitmap_parselist_user(buf, len, cpumask_bits(dstp),
 				     nr_cpumask_bits);
@@ -886,7 +898,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
  *
  * Return: -errno, or 0 for success.
  */
-static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
+static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 {
 	return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits);
 }
@@ -898,7 +910,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
  *
  * Return: -errno, or 0 for success.
  */
-static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
+static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
 	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);
 }
@@ -908,7 +920,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
  *
  * Return: size to allocate for a &struct cpumask in bytes
  */
-static inline unsigned int cpumask_size(void)
+static __always_inline unsigned int cpumask_size(void)
 {
 	return bitmap_size(large_cpumask_bits);
 }
@@ -920,7 +932,7 @@ static inline unsigned int cpumask_size(void)
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 
-static inline
+static __always_inline
 bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
 	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
@@ -938,13 +950,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
  *
  * Return: %true if allocation succeeded, %false if not
  */
-static inline
+static __always_inline
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
 	return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
 }
 
-static inline
+static __always_inline
 bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
 	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
@@ -954,7 +966,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
 
-static inline bool cpumask_available(cpumask_var_t mask)
+static __always_inline bool cpumask_available(cpumask_var_t mask)
 {
 	return mask != NULL;
 }
@@ -964,43 +976,43 @@ static inline bool cpumask_available(cpumask_var_t mask)
 #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
 #define __cpumask_var_read_mostly
 
-static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
 	return true;
 }
 
-static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
+static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
 					  int node)
 {
 	return true;
 }
 
-static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 {
 	cpumask_clear(*mask);
 	return true;
 }
 
-static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
+static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
 					  int node)
 {
 	cpumask_clear(*mask);
 	return true;
 }
 
-static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
+static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 {
 }
 
-static inline void free_cpumask_var(cpumask_var_t mask)
+static __always_inline void free_cpumask_var(cpumask_var_t mask)
 {
 }
 
-static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
+static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 }
 
-static inline bool cpumask_available(cpumask_var_t mask)
+static __always_inline bool cpumask_available(cpumask_var_t mask)
 {
 	return true;
 }
@@ -1058,7 +1070,7 @@ void set_cpu_online(unsigned int cpu, bool online);
 	((struct cpumask *)(1 ? (bitmap)				\
 			    : (void *)sizeof(__check_is_bitmap(bitmap))))
 
-static inline int __check_is_bitmap(const unsigned long *bitmap)
+static __always_inline int __check_is_bitmap(const unsigned long *bitmap)
 {
 	return 1;
 }
@@ -1073,7 +1085,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap)
 extern const unsigned long
 	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
 
-static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
+static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 {
 	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
 	p -= cpu / BITS_PER_LONG;
@@ -1100,32 +1112,32 @@ static __always_inline unsigned int num_online_cpus(void)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
 #define num_active_cpus()	cpumask_weight(cpu_active_mask)
 
-static inline bool cpu_online(unsigned int cpu)
+static __always_inline bool cpu_online(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_online_mask);
 }
 
-static inline bool cpu_enabled(unsigned int cpu)
+static __always_inline bool cpu_enabled(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_enabled_mask);
 }
 
-static inline bool cpu_possible(unsigned int cpu)
+static __always_inline bool cpu_possible(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_possible_mask);
 }
 
-static inline bool cpu_present(unsigned int cpu)
+static __always_inline bool cpu_present(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_present_mask);
 }
 
-static inline bool cpu_active(unsigned int cpu)
+static __always_inline bool cpu_active(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_active_mask);
 }
 
-static inline bool cpu_dying(unsigned int cpu)
+static __always_inline bool cpu_dying(unsigned int cpu)
 {
 	return cpumask_test_cpu(cpu, cpu_dying_mask);
 }
@@ -1138,32 +1150,32 @@ static inline bool cpu_dying(unsigned int cpu)
 #define num_present_cpus()	1U
 #define num_active_cpus()	1U
 
-static inline bool cpu_online(unsigned int cpu)
+static __always_inline bool cpu_online(unsigned int cpu)
 {
 	return cpu == 0;
 }
 
-static inline bool cpu_possible(unsigned int cpu)
+static __always_inline bool cpu_possible(unsigned int cpu)
 {
 	return cpu == 0;
 }
 
-static inline bool cpu_enabled(unsigned int cpu)
+static __always_inline bool cpu_enabled(unsigned int cpu)
 {
 	return cpu == 0;
 }
 
-static inline bool cpu_present(unsigned int cpu)
+static __always_inline bool cpu_present(unsigned int cpu)
 {
 	return cpu == 0;
 }
 
-static inline bool cpu_active(unsigned int cpu)
+static __always_inline bool cpu_active(unsigned int cpu)
 {
 	return cpu == 0;
 }
 
-static inline bool cpu_dying(unsigned int cpu)
+static __always_inline bool cpu_dying(unsigned int cpu)
 {
 	return false;
 }
@@ -1197,7 +1209,7 @@ static inline bool cpu_dying(unsigned int cpu)
  * Return: the length of the (null-terminated) @buf string, zero if
  * nothing is copied.
  */
-static inline ssize_t
+static __always_inline ssize_t
 cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
 {
 	return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask),
@@ -1220,9 +1232,9 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
  * Return: the length of how many bytes have been copied, excluding
  * terminating '\0'.
  */
-static inline ssize_t
-cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
-		loff_t off, size_t count)
+static __always_inline
+ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
+				    loff_t off, size_t count)
 {
 	return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask),
 				   nr_cpu_ids, off, count) - 1;
@@ -1242,9 +1254,9 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
  * Return: the length of how many bytes have been copied, excluding
  * terminating '\0'.
  */
-static inline ssize_t
-cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
-		loff_t off, size_t count)
+static __always_inline
+ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
+				 loff_t off, size_t count)
 {
 	return bitmap_print_list_to_buf(buf, cpumask_bits(mask),
 				   nr_cpu_ids, off, count) - 1;
-- 
cgit v1.2.3


From 54c9e0085bd1015e524d8f4d3c4e78a7bc77ffca Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 18 Jul 2024 17:50:40 -0700
Subject: nodemask: Switch from inline to __always_inline

'inline' keyword is only a recommendation for compiler. If it decides to
not inline nodemask functions, the whole small_const_nbits() machinery
doesn't work.

This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch
replaces 'inline' directive with unconditional '__always_inline' to make
sure that there's always a chance for compile-time optimization. It doesn't
change size of kernel image, according to bloat-o-meter.

[[ Brian: split out from:
      Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline
      https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/
   But rewritten, as there were too many conflicts. ]]

Co-developed-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/nodemask.h | 86 ++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index b61438313a73..9fd7a0ce9c1a 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_;
  */
 #define nodemask_pr_args(maskp)	__nodemask_pr_numnodes(maskp), \
 				__nodemask_pr_bits(maskp)
-static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
+static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
 {
 	return m ? MAX_NUMNODES : 0;
 }
-static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
+static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
 {
 	return m ? m->bits : NULL;
 }
@@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
 }
 
 #define node_clear(node, dst) __node_clear((node), &(dst))
-static inline void __node_clear(int node, volatile nodemask_t *dstp)
+static __always_inline void __node_clear(int node, volatile nodemask_t *dstp)
 {
 	clear_bit(node, dstp->bits);
 }
 
 #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
-static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
+static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_fill(dstp->bits, nbits);
 }
 
 #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
-static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
+static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
 {
 	bitmap_zero(dstp->bits, nbits);
 }
@@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
 
 #define node_test_and_set(node, nodemask) \
 			__node_test_and_set((node), &(nodemask))
-static inline bool __node_test_and_set(int node, nodemask_t *addr)
+static __always_inline bool __node_test_and_set(int node, nodemask_t *addr)
 {
 	return test_and_set_bit(node, addr->bits);
 }
 
 #define nodes_and(dst, src1, src2) \
 			__nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
@@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
 
 #define nodes_or(dst, src1, src2) \
 			__nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
@@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
 
 #define nodes_xor(dst, src1, src2) \
 			__nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
@@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
 
 #define nodes_andnot(dst, src1, src2) \
 			__nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
-static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
+static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
@@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
 
 #define nodes_complement(dst, src) \
 			__nodes_complement(&(dst), &(src), MAX_NUMNODES)
-static inline void __nodes_complement(nodemask_t *dstp,
+static __always_inline void __nodes_complement(nodemask_t *dstp,
 					const nodemask_t *srcp, unsigned int nbits)
 {
 	bitmap_complement(dstp->bits, srcp->bits, nbits);
@@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp,
 
 #define nodes_equal(src1, src2) \
 			__nodes_equal(&(src1), &(src2), MAX_NUMNODES)
-static inline bool __nodes_equal(const nodemask_t *src1p,
+static __always_inline bool __nodes_equal(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_equal(src1p->bits, src2p->bits, nbits);
@@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p,
 
 #define nodes_intersects(src1, src2) \
 			__nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
-static inline bool __nodes_intersects(const nodemask_t *src1p,
+static __always_inline bool __nodes_intersects(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
@@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p,
 
 #define nodes_subset(src1, src2) \
 			__nodes_subset(&(src1), &(src2), MAX_NUMNODES)
-static inline bool __nodes_subset(const nodemask_t *src1p,
+static __always_inline bool __nodes_subset(const nodemask_t *src1p,
 					const nodemask_t *src2p, unsigned int nbits)
 {
 	return bitmap_subset(src1p->bits, src2p->bits, nbits);
 }
 
 #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
-static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
+static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_empty(srcp->bits, nbits);
 }
 
 #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
-static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
+static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_full(srcp->bits, nbits);
 }
 
 #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
-static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
+static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
 {
 	return bitmap_weight(srcp->bits, nbits);
 }
 
 #define nodes_shift_right(dst, src, n) \
 			__nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
-static inline void __nodes_shift_right(nodemask_t *dstp,
+static __always_inline void __nodes_shift_right(nodemask_t *dstp,
 					const nodemask_t *srcp, int n, int nbits)
 {
 	bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
@@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp,
 
 #define nodes_shift_left(dst, src, n) \
 			__nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
-static inline void __nodes_shift_left(nodemask_t *dstp,
+static __always_inline void __nodes_shift_left(nodemask_t *dstp,
 					const nodemask_t *srcp, int n, int nbits)
 {
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
@@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp,
           > MAX_NUMNODES, then the silly min_ts could be dropped. */
 
 #define first_node(src) __first_node(&(src))
-static inline unsigned int __first_node(const nodemask_t *srcp)
+static __always_inline unsigned int __first_node(const nodemask_t *srcp)
 {
 	return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
 }
 
 #define next_node(n, src) __next_node((n), &(src))
-static inline unsigned int __next_node(int n, const nodemask_t *srcp)
+static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp)
 {
 	return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
@@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp)
  * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
  */
 #define next_node_in(n, src) __next_node_in((n), &(src))
-static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
+static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
 {
 	unsigned int ret = __next_node(node, srcp);
 
@@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
 	return ret;
 }
 
-static inline void init_nodemask_of_node(nodemask_t *mask, int node)
+static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
 	nodes_clear(*mask);
 	node_set(node, *mask);
@@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 })
 
 #define first_unset_node(mask) __first_unset_node(&(mask))
-static inline unsigned int __first_unset_node(const nodemask_t *maskp)
+static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp)
 {
 	return min_t(unsigned int, MAX_NUMNODES,
 			find_first_zero_bit(maskp->bits, MAX_NUMNODES));
@@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp)
 
 #define nodemask_parse_user(ubuf, ulen, dst) \
 		__nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
-static inline int __nodemask_parse_user(const char __user *buf, int len,
+static __always_inline int __nodemask_parse_user(const char __user *buf, int len,
 					nodemask_t *dstp, int nbits)
 {
 	return bitmap_parse_user(buf, len, dstp->bits, nbits);
 }
 
 #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
-static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
+static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
 {
 	return bitmap_parselist(buf, dstp->bits, nbits);
 }
 
 #define node_remap(oldbit, old, new) \
 		__node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
-static inline int __node_remap(int oldbit,
+static __always_inline int __node_remap(int oldbit,
 		const nodemask_t *oldp, const nodemask_t *newp, int nbits)
 {
 	return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
@@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit,
 
 #define nodes_remap(dst, src, old, new) \
 		__nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
-static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
+static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 		const nodemask_t *oldp, const nodemask_t *newp, int nbits)
 {
 	bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
@@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 
 #define nodes_onto(dst, orig, relmap) \
 		__nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
-static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
+static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
 		const nodemask_t *relmapp, int nbits)
 {
 	bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
@@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
 
 #define nodes_fold(dst, orig, sz) \
 		__nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES)
-static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
+static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
 		int sz, int nbits)
 {
 	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
@@ -418,22 +418,22 @@ enum node_states {
 extern nodemask_t node_states[NR_NODE_STATES];
 
 #if MAX_NUMNODES > 1
-static inline int node_state(int node, enum node_states state)
+static __always_inline int node_state(int node, enum node_states state)
 {
 	return node_isset(node, node_states[state]);
 }
 
-static inline void node_set_state(int node, enum node_states state)
+static __always_inline void node_set_state(int node, enum node_states state)
 {
 	__node_set(node, &node_states[state]);
 }
 
-static inline void node_clear_state(int node, enum node_states state)
+static __always_inline void node_clear_state(int node, enum node_states state)
 {
 	__node_clear(node, &node_states[state]);
 }
 
-static inline int num_node_state(enum node_states state)
+static __always_inline int num_node_state(enum node_states state)
 {
 	return nodes_weight(node_states[state]);
 }
@@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state)
 
 #define first_online_node	first_node(node_states[N_ONLINE])
 #define first_memory_node	first_node(node_states[N_MEMORY])
-static inline unsigned int next_online_node(int nid)
+static __always_inline unsigned int next_online_node(int nid)
 {
 	return next_node(nid, node_states[N_ONLINE]);
 }
-static inline unsigned int next_memory_node(int nid)
+static __always_inline unsigned int next_memory_node(int nid)
 {
 	return next_node(nid, node_states[N_MEMORY]);
 }
@@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid)
 extern unsigned int nr_node_ids;
 extern unsigned int nr_online_nodes;
 
-static inline void node_set_online(int nid)
+static __always_inline void node_set_online(int nid)
 {
 	node_set_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
 }
 
-static inline void node_set_offline(int nid)
+static __always_inline void node_set_offline(int nid)
 {
 	node_clear_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
@@ -469,20 +469,20 @@ static inline void node_set_offline(int nid)
 
 #else
 
-static inline int node_state(int node, enum node_states state)
+static __always_inline int node_state(int node, enum node_states state)
 {
 	return node == 0;
 }
 
-static inline void node_set_state(int node, enum node_states state)
+static __always_inline void node_set_state(int node, enum node_states state)
 {
 }
 
-static inline void node_clear_state(int node, enum node_states state)
+static __always_inline void node_clear_state(int node, enum node_states state)
 {
 }
 
-static inline int num_node_state(enum node_states state)
+static __always_inline int num_node_state(enum node_states state)
 {
 	return 1;
 }
@@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state)
 
 #endif
 
-static inline int node_random(const nodemask_t *maskp)
+static __always_inline int node_random(const nodemask_t *maskp)
 {
 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
 	int w, bit;
-- 
cgit v1.2.3


From b27404b2bbf951a11500525dea15681cc970226c Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 19 Aug 2024 08:55:46 +0800
Subject: ALSA/ASoC/SoundWire: Intel: use single definition for
 SDW_INTEL_MAX_LINKS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The definitions are currently duplicated in intel-sdw-acpi.c and
sof_sdw.c.  Move the definition to the sdw_intel.h header, and change
the prefix to make it Intel-specific.

No functionality change in this patch.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20240819005548.5867-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index d537587b4499..87d82ea9a13a 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -447,4 +447,9 @@ extern const struct sdw_intel_hw_ops sdw_intel_lnl_hw_ops;
 
 #define SDW_INTEL_DEV_NUM_IDA_MIN           6
 
+/*
+ * Max number of links supported in hardware
+ */
+#define SDW_INTEL_MAX_LINKS                4
+
 #endif
-- 
cgit v1.2.3


From d2234596be2192d9c1ae34f8c7534531191bc433 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 19 Aug 2024 08:55:47 +0800
Subject: soundwire: intel: add probe-time check on link id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In older platforms, the number of links was constant and hard-coded to
4. Newer platforms can have varying number of links, so we need to add
a probe-time check to make sure the ACPI-reported information with
_DSD properties is aligned with hardware capabilities reported in the
SoundWire LCAP register.

Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20240819005548.5867-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 87d82ea9a13a..cb8e7396b4db 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -388,6 +388,7 @@ struct sdw_intel;
 /* struct intel_sdw_hw_ops - SoundWire ops for Intel platforms.
  * @debugfs_init: initialize all debugfs capabilities
  * @debugfs_exit: close and cleanup debugfs capabilities
+ * @get_link_count: fetch link count from hardware registers
  * @register_dai: read all PDI information and register DAIs
  * @check_clock_stop: throw error message if clock is not stopped.
  * @start_bus: normal start
@@ -412,6 +413,8 @@ struct sdw_intel_hw_ops {
 	void (*debugfs_init)(struct sdw_intel *sdw);
 	void (*debugfs_exit)(struct sdw_intel *sdw);
 
+	int (*get_link_count)(struct sdw_intel *sdw);
+
 	int (*register_dai)(struct sdw_intel *sdw);
 
 	void (*check_clock_stop)(struct sdw_intel *sdw);
-- 
cgit v1.2.3


From 1f3662838a05f1ab6af89a417f6f252d91d0806b Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 19 Aug 2024 08:55:48 +0800
Subject: soundwire: intel: increase maximum number of links
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel platforms have enabled 4 links since the beginning, newer
platforms now have 5 links. Update the definition accordingly.

This patch will have no effect on older platforms where the number of
links was hard-coded. A follow-up patch will add a dynamic check that
the ACPI-reported information is aligned with hardware capabilities on
newer platforms.

Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20240819005548.5867-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index cb8e7396b4db..37ae69365fe2 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -453,6 +453,6 @@ extern const struct sdw_intel_hw_ops sdw_intel_lnl_hw_ops;
 /*
  * Max number of links supported in hardware
  */
-#define SDW_INTEL_MAX_LINKS                4
+#define SDW_INTEL_MAX_LINKS                5
 
 #endif
-- 
cgit v1.2.3


From c393eaa85349941badf3ce5f087400dbaf3bbe02 Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Fri, 26 Jul 2024 11:05:25 +0800
Subject: fs: don't flush in-flight wb switches for superblocks without cgroup
 writeback

When deactivating any type of superblock, it had to wait for the in-flight
wb switches to be completed. wb switches are executed in inode_switch_wbs_work_fn()
which needs to acquire the wb_switch_rwsem and races against sync_inodes_sb().
If there are too much dirty data in the superblock, the waiting time may increase
significantly.

For superblocks without cgroup writeback such as tmpfs, they have nothing to
do with the wb swithes, so the flushing can be avoided.

Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Link: https://lore.kernel.org/r/20240726030525.180330-1-haifeng.xu@shopee.com
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/writeback.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1a54676d843a..56b85841ae4c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -217,7 +217,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 			      size_t bytes);
 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
 			   enum wb_reason reason, struct wb_completion *done);
-void cgroup_writeback_umount(void);
+void cgroup_writeback_umount(struct super_block *sb);
 bool cleanup_offline_cgwb(struct bdi_writeback *wb);
 
 /**
@@ -324,7 +324,7 @@ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
 {
 }
 
-static inline void cgroup_writeback_umount(void)
+static inline void cgroup_writeback_umount(struct super_block *sb)
 {
 }
 
-- 
cgit v1.2.3


From c01a5d89e5c8abe638107be2a4ea9e4c7fcdd7f6 Mon Sep 17 00:00:00 2001
From: Wang Long <w@laoqinren.net>
Date: Fri, 2 Aug 2024 17:19:01 +0800
Subject: percpu-rwsem: remove the unused parameter 'read'

In the function percpu_rwsem_release, the parameter `read`
is unused, so remove it.

Signed-off-by: Wang Long <w@laoqinren.net>
Link: https://lore.kernel.org/r/20240802091901.2546797-1-w@laoqinren.net
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h           | 2 +-
 include/linux/percpu-rwsem.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb0426f349fc..c0286d479c9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1683,7 +1683,7 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
 #define __sb_writers_acquired(sb, lev)	\
 	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 #define __sb_writers_release(sb, lev)	\
-	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
 
 /**
  * __sb_write_started - check if sb freeze level is held
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 36b942b67b7d..c012df33a9f0 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -145,7 +145,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 #define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
-					bool read, unsigned long ip)
+					unsigned long ip)
 {
 	lock_release(&sem->dep_map, ip);
 }
-- 
cgit v1.2.3


From 2865baf54077aa98fcdb478cefe6a42c417b9374 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Apr 2024 20:04:58 -0700
Subject: x86: support user address masking instead of non-speculative
 conditional

The Spectre-v1 mitigations made "access_ok()" much more expensive, since
it has to serialize execution with the test for a valid user address.

All the normal user copy routines avoid this by just masking the user
address with a data-dependent mask instead, but the fast
"unsafe_user_read()" kind of patterms that were supposed to be a fast
case got slowed down.

This introduces a notion of using

	src = masked_user_access_begin(src);

to do the user address sanity using a data-dependent mask instead of the
more traditional conditional

	if (user_read_access_begin(src, len)) {

model.

This model only works for dense accesses that start at 'src' and on
architectures that have a guard region that is guaranteed to fault in
between the user space and the kernel space area.

With this, the user access doesn't need to be manually checked, because
a bad address is guaranteed to fault (by some architecture masking
trick: on x86-64 this involves just turning an invalid user address into
all ones, since we don't map the top of address space).

This only converts a couple of examples for now.  Example x86-64 code
generation for loading two words from user space:

        stac
        mov    %rax,%rcx
        sar    $0x3f,%rcx
        or     %rax,%rcx
        mov    (%rcx),%r13
        mov    0x8(%rcx),%r14
        clac

where all the error handling and -EFAULT is now purely handled out of
line by the exception path.

Of course, if the micro-architecture does badly at 'clac' and 'stac',
the above is still pitifully slow.  But at least we did as well as we
could.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uaccess.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 3064314f4832..f18371f6cf36 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -32,6 +32,13 @@
 })
 #endif
 
+#ifdef masked_user_access_begin
+ #define can_do_masked_user_access() 1
+#else
+ #define can_do_masked_user_access() 0
+ #define masked_user_access_begin(src) NULL
+#endif
+
 /*
  * Architectures should provide two primitives (raw_copy_{to,from}_user())
  * and get rid of their private instances of copy_{to,from}_user() and
-- 
cgit v1.2.3


From 8dffaec34dd55473adcbc924a4c9b04aaa0d4278 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Aug 2024 12:18:23 -1000
Subject: workqueue: Fix htmldocs build warning

Fix htmldocs build warning introduced by ec0a7d44b358 ("workqueue: Add
interface for user-defined workqueue lockdep map").

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matthew Brost <matthew.brost@intel.com>
---
 include/linux/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 8ccbf510880b..04dff07a9e2b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -534,7 +534,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
  * @lockdep_map: user-defined lockdep_map
- * @args: args for @fmt
+ * @...: args for @fmt
  *
  * Same as alloc_ordered_workqueue but with the a user-define lockdep_map.
  * Useful for workqueues created with the same purpose and to avoid leaking a
-- 
cgit v1.2.3


From 7f6287417baf57754f47687c6ea1a749a0686ab0 Mon Sep 17 00:00:00 2001
From: Matteo Croce <teknoraver@meta.com>
Date: Mon, 19 Aug 2024 18:28:05 +0200
Subject: bpf: Allow bpf_current_task_under_cgroup() with BPF_CGROUP_*

The helper bpf_current_task_under_cgroup() currently is only allowed for
tracing programs, allow its usage also in the BPF_CGROUP_* program types.

Move the code from kernel/trace/bpf_trace.c to kernel/bpf/helpers.c,
so it compiles also without CONFIG_BPF_EVENTS.

This will be used in systemd-networkd to monitor the sysctl writes,
and filter it's own writes from others:
https://github.com/systemd/systemd/pull/32212

Signed-off-by: Matteo Croce <teknoraver@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240819162805.78235-3-technoboy85@gmail.com
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b9425e410bcb..f0192c173ed8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3206,6 +3206,7 @@ extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
 extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto;
+extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
-- 
cgit v1.2.3


From 6ad8bc92a47702f0b5d0b96b680199910b89f688 Mon Sep 17 00:00:00 2001
From: Christian Hopps <chopps@labn.net>
Date: Thu, 15 Aug 2024 13:21:14 -0400
Subject: net: add copy from skb_seq_state to buffer function

Add an skb helper function to copy a range of bytes from within
an existing skb_seq_state.

Signed-off-by: Christian Hopps <chopps@labn.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 29c3ea5b6e93..a871533b8568 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1433,6 +1433,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
 unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
 			  struct skb_seq_state *st);
 void skb_abort_seq_read(struct skb_seq_state *st);
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len);
 
 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 			   unsigned int to, struct ts_config *config);
-- 
cgit v1.2.3


From b6cee6544d01b8ac01232d343b1b2b594d94d61c Mon Sep 17 00:00:00 2001
From: Dikshita Agarwal <quic_dikshita@quicinc.com>
Date: Mon, 19 Aug 2024 15:59:09 +0530
Subject: PM: domains: add device managed version of
 dev_pm_domain_attach|detach_list()

Add the devres-enabled version of dev_pm_domain_attach|detach_list.
If client drivers use devm_pm_domain_attach_list() to attach the PM domains,
devm_pm_domain_detach_list() will be invoked implicitly during remove phase.

Signed-off-by: Dikshita Agarwal <quic_dikshita@quicinc.com>
Link: https://lore.kernel.org/r/1724063350-11993-2-git-send-email-quic_dikshita@quicinc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/pm_domain.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b86bb52858ac..b637ec14025f 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -476,6 +476,9 @@ struct device *dev_pm_domain_attach_by_name(struct device *dev,
 int dev_pm_domain_attach_list(struct device *dev,
 			      const struct dev_pm_domain_attach_data *data,
 			      struct dev_pm_domain_list **list);
+int devm_pm_domain_attach_list(struct device *dev,
+			       const struct dev_pm_domain_attach_data *data,
+			       struct dev_pm_domain_list **list);
 void dev_pm_domain_detach(struct device *dev, bool power_off);
 void dev_pm_domain_detach_list(struct dev_pm_domain_list *list);
 int dev_pm_domain_start(struct device *dev);
@@ -502,6 +505,14 @@ static inline int dev_pm_domain_attach_list(struct device *dev,
 {
 	return 0;
 }
+
+static inline int devm_pm_domain_attach_list(struct device *dev,
+					     const struct dev_pm_domain_attach_data *data,
+					     struct dev_pm_domain_list **list)
+{
+	return 0;
+}
+
 static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {}
 static inline void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) {}
 static inline int dev_pm_domain_start(struct device *dev)
-- 
cgit v1.2.3


From acb0184fe9bca3bb7103dadc76ba9d38c969ca86 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:53 +0100
Subject: coresight: Move struct coresight_trace_id_map to common header

The trace ID maps will need to be created and stored by the core and
Perf code so move the definition up to the common header.

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: James Clark <james.clark@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Tested-by: Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-12-james.clark@linaro.org
---
 include/linux/coresight.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index f09ace92176e..c16c61a8411d 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -218,6 +218,24 @@ struct coresight_sysfs_link {
 	const char *target_name;
 };
 
+/* architecturally we have 128 IDs some of which are reserved */
+#define CORESIGHT_TRACE_IDS_MAX 128
+
+/**
+ * Trace ID map.
+ *
+ * @used_ids:	Bitmap to register available (bit = 0) and in use (bit = 1) IDs.
+ *		Initialised so that the reserved IDs are permanently marked as
+ *		in use.
+ * @pend_rel_ids: CPU IDs that have been released by the trace source but not
+ *		  yet marked as available, to allow re-allocation to the same
+ *		  CPU during a perf session.
+ */
+struct coresight_trace_id_map {
+	DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX);
+	DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX);
+};
+
 /**
  * struct coresight_device - representation of a device as used by the framework
  * @pdata:	Platform data with device connections associated to this device.
-- 
cgit v1.2.3


From d53c8253c7822fbc524adcd950e7c6de6a229c99 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:55 +0100
Subject: coresight: Make CPU id map a property of a trace ID map

The global CPU ID mappings won't work for per-sink ID maps so move it to
the ID map struct. coresight_trace_id_release_all_pending() is hard
coded to operate on the default map, but once Perf sessions use their
own maps the pending release mechanism will be deleted. So it doesn't
need to be extended to accept a trace ID map argument at this point.

Signed-off-by: James Clark <james.clark@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Tested-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-14-james.clark@linaro.org
---
 include/linux/coresight.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index c16c61a8411d..7d62b88bfb5c 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -234,6 +234,7 @@ struct coresight_sysfs_link {
 struct coresight_trace_id_map {
 	DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX);
 	DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX);
+	atomic_t __percpu *cpu_map;
 };
 
 /**
-- 
cgit v1.2.3


From 5ad628a7617607baac53745f8025638b967470a2 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:56 +0100
Subject: coresight: Use per-sink trace ID maps for Perf sessions

This will allow sessions with more than CORESIGHT_TRACE_IDS_MAX ETMs
as long as there are fewer than that many ETMs connected to each sink.

Each sink owns its own trace ID map, and any Perf session connecting to
that sink will allocate from it, even if the sink is currently in use by
other users. This is similar to the existing behavior where the dynamic
trace IDs are constant as long as there is any concurrent Perf session
active. It's not completely optimal because slightly more IDs will be
used than necessary, but the optimal solution involves tracking the PIDs
of each session and allocating ID maps based on the session owner. This
is difficult to do with the combination of per-thread and per-cpu modes
and some scheduling issues. The complexity of this isn't likely to worth
it because even with multiple users they'd just see a difference in the
ordering of ID allocations rather than hitting any limits (unless the
hardware does have too many ETMs connected to one sink).

Signed-off-by: James Clark <james.clark@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-15-james.clark@linaro.org
---
 include/linux/coresight.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 7d62b88bfb5c..9c3067e2e38b 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -290,6 +290,7 @@ struct coresight_device {
 	bool sysfs_sink_activated;
 	struct dev_ext_attribute *ea;
 	struct coresight_device *def_sink;
+	struct coresight_trace_id_map perf_sink_id_map;
 	/* sysfs links between components */
 	int nr_links;
 	bool has_conns_grp;
@@ -384,7 +385,7 @@ struct coresight_ops_link {
 struct coresight_ops_source {
 	int (*cpu_id)(struct coresight_device *csdev);
 	int (*enable)(struct coresight_device *csdev, struct perf_event *event,
-		      enum cs_mode mode);
+		      enum cs_mode mode, struct coresight_trace_id_map *id_map);
 	void (*disable)(struct coresight_device *csdev,
 			struct perf_event *event);
 };
-- 
cgit v1.2.3


From de0029fdde86092c75472c92e56a962f4edee0f6 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:57 +0100
Subject: coresight: Remove pending trace ID release mechanism

Pending the release of IDs was a way of managing concurrent sysfs and
Perf sessions in a single global ID map. Perf may have finished while
sysfs hadn't, and Perf shouldn't release the IDs in use by sysfs and
vice versa.

Now that Perf uses its own exclusive ID maps, pending release doesn't
result in any different behavior than just releasing all IDs when the
last Perf session finishes. As part of the per-sink trace ID change, we
would have still had to make the pending mechanism work on a per-sink
basis, due to the overlapping ID allocations, so instead of making that
more complicated, just remove it.

Signed-off-by: James Clark <james.clark@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-16-james.clark@linaro.org
---
 include/linux/coresight.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 9c3067e2e38b..197949fd2c35 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -227,14 +227,12 @@ struct coresight_sysfs_link {
  * @used_ids:	Bitmap to register available (bit = 0) and in use (bit = 1) IDs.
  *		Initialised so that the reserved IDs are permanently marked as
  *		in use.
- * @pend_rel_ids: CPU IDs that have been released by the trace source but not
- *		  yet marked as available, to allow re-allocation to the same
- *		  CPU during a perf session.
+ * @perf_cs_etm_session_active: Number of Perf sessions using this ID map.
  */
 struct coresight_trace_id_map {
 	DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX);
-	DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX);
 	atomic_t __percpu *cpu_map;
+	atomic_t perf_cs_etm_session_active;
 };
 
 /**
-- 
cgit v1.2.3


From 487eec8da80aef16229d30429f1b26090b1bf0eb Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:58 +0100
Subject: coresight: Emit sink ID in the HW_ID packets

For Perf to be able to decode when per-sink trace IDs are used, emit the
sink that's being written to for each ETM.

Perf currently errors out if it sees a newer packet version so instead
of bumping it, add a new minor version field. This can be used to
signify new versions that have backwards compatible fields. Considering
this change is only for high core count machines, it doesn't make sense
to make a breaking change for everyone.

Signed-off-by: James Clark <james.clark@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-17-james.clark@linaro.org
---
 include/linux/coresight-pmu.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h
index 51ac441a37c3..89b0ac0014b0 100644
--- a/include/linux/coresight-pmu.h
+++ b/include/linux/coresight-pmu.h
@@ -49,12 +49,21 @@
  * Interpretation of the PERF_RECORD_AUX_OUTPUT_HW_ID payload.
  * Used to associate a CPU with the CoreSight Trace ID.
  * [07:00] - Trace ID - uses 8 bits to make value easy to read in file.
- * [59:08] - Unused (SBZ)
- * [63:60] - Version
+ * [39:08] - Sink ID - as reported in /sys/bus/event_source/devices/cs_etm/sinks/
+ *	      Added in minor version 1.
+ * [55:40] - Unused (SBZ)
+ * [59:56] - Minor Version - previously existing fields are compatible with
+ *	      all minor versions.
+ * [63:60] - Major Version - previously existing fields mean different things
+ *	      in new major versions.
  */
 #define CS_AUX_HW_ID_TRACE_ID_MASK	GENMASK_ULL(7, 0)
-#define CS_AUX_HW_ID_VERSION_MASK	GENMASK_ULL(63, 60)
+#define CS_AUX_HW_ID_SINK_ID_MASK	GENMASK_ULL(39, 8)
 
-#define CS_AUX_HW_ID_CURR_VERSION 0
+#define CS_AUX_HW_ID_MINOR_VERSION_MASK	GENMASK_ULL(59, 56)
+#define CS_AUX_HW_ID_MAJOR_VERSION_MASK	GENMASK_ULL(63, 60)
+
+#define CS_AUX_HW_ID_MAJOR_VERSION 0
+#define CS_AUX_HW_ID_MINOR_VERSION 1
 
 #endif
-- 
cgit v1.2.3


From 988d40a4d4e7d671305bea501562a5d1a1d479fa Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@arm.com>
Date: Mon, 22 Jul 2024 11:11:59 +0100
Subject: coresight: Make trace ID map spinlock local to the map

Reduce contention on the lock by replacing the global lock with one for
each map.

Signed-off-by: James Clark <james.clark@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240722101202.26915-18-james.clark@linaro.org
---
 include/linux/coresight.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 197949fd2c35..c13342594278 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -233,6 +233,7 @@ struct coresight_trace_id_map {
 	DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX);
 	atomic_t __percpu *cpu_map;
 	atomic_t perf_cs_etm_session_active;
+	spinlock_t lock;
 };
 
 /**
-- 
cgit v1.2.3


From 7c432a18ad216b4f7b08e93287586d60e12a3b7b Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 20 Aug 2024 15:27:55 +0100
Subject: firmware: arm_ffa: Update the FF-A command list with v1.2 additions

Arm Firmware Framework for A-profile(FFA) v1.2 introduces register based
discovery mechanism and direct messaging extensions that enables to target
specific UUID within a partition.

Let us add all the newly supported FF-A function IDs in the spec.
Also update to the error values and associated handling.

Message-Id: <20240820-ffa_v1-2-v2-2-18c0c5f3c65e@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 89683f31ae12..b34f0c0dc2c5 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -73,6 +73,11 @@
 #define FFA_FN64_MEM_PERM_GET		FFA_SMC_64(0x88)
 #define FFA_MEM_PERM_SET		FFA_SMC_32(0x89)
 #define FFA_FN64_MEM_PERM_SET		FFA_SMC_64(0x89)
+#define FFA_CONSOLE_LOG			FFA_SMC_32(0x8A)
+#define FFA_PARTITION_INFO_GET_REGS	FFA_SMC_64(0x8B)
+#define FFA_EL3_INTR_HANDLE		FFA_SMC_32(0x8C)
+#define FFA_MSG_SEND_DIRECT_REQ2	FFA_SMC_64(0x8D)
+#define FFA_MSG_SEND_DIRECT_RESP2	FFA_SMC_64(0x8E)
 
 /*
  * For some calls it is necessary to use SMC64 to pass or return 64-bit values.
-- 
cgit v1.2.3


From aaef3bc98129c86078b336f16788dd733b0728a4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Tue, 20 Aug 2024 15:27:58 +0100
Subject: firmware: arm_ffa: Add support for FFA_MSG_SEND_DIRECT_{REQ,RESP}2

FFA_MSG_SEND_DIRECT_{REQ,RESP} supported only x3-x7 to pass implementation
defined values as part of the message. This may not be sufficient sometimes
and also it would be good to use all the registers supported by SMCCC v1.2
(x0-x17) for such register based communication.

Also another limitation with the FFA_MSG_SEND_DIRECT_{REQ,RESP} is the
ability to target a specific service within the partition based on it's
UUID.

In order to address both of the above limitation, FF-A v1.2 introduced
FFA_MSG_SEND_DIRECT_{REQ,RESP}2 which has the ability to target the
message to a specific service based on its UUID within a partition as
well as utilise all the available registers(x4-x17 specifically) for
the communication.

This change adds support for FFA_MSG_SEND_DIRECT_REQ2 and
FFA_MSG_SEND_DIRECT_RESP2.

Message-Id: <20240820-ffa_v1-2-v2-5-18c0c5f3c65e@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index b34f0c0dc2c5..a28e2a6a13d0 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -270,6 +270,11 @@ struct ffa_indirect_msg_hdr {
 	u32 size;
 };
 
+/* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP}2 which pass data via registers */
+struct ffa_send_direct_data2 {
+	unsigned long data[14]; /* x4-x17 */
+};
+
 struct ffa_mem_region_addr_range {
 	/* The base IPA of the constituent memory region, aligned to 4 kiB */
 	u64 address;
@@ -431,6 +436,8 @@ struct ffa_msg_ops {
 	int (*sync_send_receive)(struct ffa_device *dev,
 				 struct ffa_send_direct_data *data);
 	int (*indirect_send)(struct ffa_device *dev, void *buf, size_t sz);
+	int (*sync_send_receive2)(struct ffa_device *dev, const uuid_t *uuid,
+				  struct ffa_send_direct_data2 *data);
 };
 
 struct ffa_mem_ops {
-- 
cgit v1.2.3


From e68ac2b488495fa4d127d6105ce633849859957a Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 15 Aug 2024 11:15:40 -0600
Subject: softirq: Remove unused 'action' parameter from action callback

When soft interrupt actions are called, they are passed a pointer to the
struct softirq action which contains the action's function pointer.

This pointer isn't useful, as the action callback already knows what
function it is. And since each callback handles a specific soft interrupt,
the callback also knows which soft interrupt number is running.

No soft interrupt action callback actually uses this parameter, so remove
it from the function pointer signature. This clarifies that soft interrupt
actions are global routines and makes it slightly cheaper to call them.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/all/20240815171549.3260003-1-csander@purestorage.com
---
 include/linux/interrupt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 3f30c88e0b4c..694de61e0b38 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -594,7 +594,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS];
 
 struct softirq_action
 {
-	void	(*action)(struct softirq_action *);
+	void	(*action)(void);
 };
 
 asmlinkage void do_softirq(void);
@@ -609,7 +609,7 @@ static inline void do_softirq_post_smp_call_flush(unsigned int unused)
 }
 #endif
 
-extern void open_softirq(int nr, void (*action)(struct softirq_action *));
+extern void open_softirq(int nr, void (*action)(void));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
 
-- 
cgit v1.2.3


From 2fea0c26b82f304f43b3905e56d954cf98a6d0e9 Mon Sep 17 00:00:00 2001
From: Fan Wu <wufan@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:19 -0700
Subject: initramfs,lsm: add a security hook to do_populate_rootfs()

This patch introduces a new hook to notify security system that the
content of initramfs has been unpacked into the rootfs.

Upon receiving this notification, the security system can activate
a policy to allow only files that originated from the initramfs to
execute or load into kernel during the early stages of booting.

This approach is crucial for minimizing the attack surface by
ensuring that only trusted files from the initramfs are operational
in the critical boot phase.

Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 2 ++
 include/linux/security.h      | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 520730fe2d94..22a14fc794fe 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -449,3 +449,5 @@ LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
 LSM_HOOK(int, 0, uring_sqpoll, void)
 LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
 #endif /* CONFIG_IO_URING */
+
+LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void)
diff --git a/include/linux/security.h b/include/linux/security.h
index 62233fec8ead..3298855abdbc 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2256,4 +2256,12 @@ static inline int security_uring_cmd(struct io_uring_cmd *ioucmd)
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_IO_URING */
 
+#ifdef CONFIG_SECURITY
+extern void security_initramfs_populated(void);
+#else
+static inline void security_initramfs_populated(void)
+{
+}
+#endif /* CONFIG_SECURITY */
+
 #endif /* ! __LINUX_SECURITY_H */
-- 
cgit v1.2.3


From 7138679ff2a2b1674f16618558d6cabea6ab2c53 Mon Sep 17 00:00:00 2001
From: Fan Wu <wufan@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:21 -0700
Subject: lsm: add new securityfs delete function

When deleting a directory in the security file system, the existing
securityfs_remove requires the directory to be empty, otherwise
it will do nothing. This leads to a potential risk that the security
file system might be in an unclean state when the intended deletion
did not happen.

This commit introduces a new function securityfs_recursive_remove
to recursively delete a directory without leaving an unclean state.

Co-developed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 3298855abdbc..f6d2bc69cfa6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2090,6 +2090,7 @@ struct dentry *securityfs_create_symlink(const char *name,
 					 const char *target,
 					 const struct inode_operations *iops);
 extern void securityfs_remove(struct dentry *dentry);
+extern void securityfs_recursive_remove(struct dentry *dentry);
 
 #else /* CONFIG_SECURITYFS */
 
-- 
cgit v1.2.3


From b55d26bd1891423a3cdccf816b386aec8bbefc87 Mon Sep 17 00:00:00 2001
From: Deven Bowers <deven.desai@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:25 -0700
Subject: block,lsm: add LSM blob and new LSM hooks for block devices

This patch introduces a new LSM blob to the block_device structure,
enabling the security subsystem to store security-sensitive data related
to block devices. Currently, for a device mapper's mapped device containing
a dm-verity target, critical security information such as the roothash and
its signing state are not readily accessible. Specifically, while the
dm-verity volume creation process passes the dm-verity roothash and its
signature from userspace to the kernel, the roothash is stored privately
within the dm-verity target, and its signature is discarded
post-verification. This makes it extremely hard for the security subsystem
to utilize these data.

With the addition of the LSM blob to the block_device structure, the
security subsystem can now retain and manage important security metadata
such as the roothash and the signing state of a dm-verity by storing them
inside the blob. Access decisions can then be based on these stored data.

The implementation follows the same approach used for security blobs in
other structures like struct file, struct inode, and struct superblock.
The initialization of the security blob occurs after the creation of the
struct block_device, performed by the security subsystem. Similarly, the
security blob is freed by the security subsystem before the struct
block_device is deallocated or freed.

This patch also introduces a new hook security_bdev_setintegrity() to save
block device's integrity data to the new LSM blob. For example, for
dm-verity, it can use this hook to expose its roothash and signing state
to LSMs, then LSMs can save these data into the LSM blob.

Please note that the new hook should be invoked every time the security
information is updated to keep these data current. For example, in
dm-verity, if the mapping table is reloaded and configured to use a
different dm-verity target with a new roothash and signing information,
the previously stored data in the LSM blob will become obsolete. It is
crucial to re-invoke the hook to refresh these data and ensure they are up
to date. This necessity arises from the design of device-mapper, where a
device-mapper device is first created, and then targets are subsequently
loaded into it. These targets can be modified multiple times during the
device's lifetime. Therefore, while the LSM blob is allocated during the
creation of the block device, its actual contents are not initialized at
this stage and can change substantially over time. This includes
alterations from data that the LSM 'trusts' to those it does not, making
it essential to handle these changes correctly. Failure to address this
dynamic aspect could potentially allow for bypassing LSM checks.

Signed-off-by: Deven Bowers <deven.desai@linux.microsoft.com>
Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
[PM: merge fuzz, subject line tweaks]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/blk_types.h     |  3 +++
 include/linux/lsm_hook_defs.h |  5 +++++
 include/linux/lsm_hooks.h     |  1 +
 include/linux/security.h      | 26 ++++++++++++++++++++++++++
 4 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 36ed96133217..413ebdff974b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -71,6 +71,9 @@ struct block_device {
 
 	struct partition_meta_info *bd_meta_info;
 	int			bd_writers;
+#ifdef CONFIG_SECURITY
+	void			*bd_security;
+#endif
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 22a14fc794fe..860821f3bf6f 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -451,3 +451,8 @@ LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
 #endif /* CONFIG_IO_URING */
 
 LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void)
+
+LSM_HOOK(int, 0, bdev_alloc_security, struct block_device *bdev)
+LSM_HOOK(void, LSM_RET_VOID, bdev_free_security, struct block_device *bdev)
+LSM_HOOK(int, 0, bdev_setintegrity, struct block_device *bdev,
+	 enum lsm_integrity_type type, const void *value, size_t size)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 11ea0063228f..4687985b9175 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -83,6 +83,7 @@ struct lsm_blob_sizes {
 	int lbs_task;
 	int lbs_xattr_count; /* number of xattr slots in new_xattrs array */
 	int lbs_tun_dev;
+	int lbs_bdev;
 };
 
 /*
diff --git a/include/linux/security.h b/include/linux/security.h
index f6d2bc69cfa6..d7cab2d5002f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -83,6 +83,10 @@ enum lsm_event {
 	LSM_POLICY_CHANGE,
 };
 
+enum lsm_integrity_type {
+	__LSM_INT_MAX
+};
+
 /*
  * These are reasons that can be passed to the security_locked_down()
  * LSM hook. Lockdown reasons that protect kernel integrity (ie, the
@@ -509,6 +513,11 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
 int security_locked_down(enum lockdown_reason what);
 int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len,
 		      void *val, size_t val_len, u64 id, u64 flags);
+int security_bdev_alloc(struct block_device *bdev);
+void security_bdev_free(struct block_device *bdev);
+int security_bdev_setintegrity(struct block_device *bdev,
+			       enum lsm_integrity_type type, const void *value,
+			       size_t size);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -1483,6 +1492,23 @@ static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline int security_bdev_alloc(struct block_device *bdev)
+{
+	return 0;
+}
+
+static inline void security_bdev_free(struct block_device *bdev)
+{
+}
+
+static inline int security_bdev_setintegrity(struct block_device *bdev,
+					     enum lsm_integrity_type type,
+					     const void *value, size_t size)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SECURITY */
 
 #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
-- 
cgit v1.2.3


From a6af7bc3d72ff52c5526a392144347fcb3094149 Mon Sep 17 00:00:00 2001
From: Deven Bowers <deven.desai@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:26 -0700
Subject: dm-verity: expose root hash digest and signature data to LSMs

dm-verity provides a strong guarantee of a block device's integrity. As
a generic way to check the integrity of a block device, it provides
those integrity guarantees to its higher layers, including the filesystem
level.

However, critical security metadata like the dm-verity roothash and its
signing information are not easily accessible to the LSMs.
To address this limitation, this patch introduces a mechanism to store
and manage these essential security details within a newly added LSM blob
in the block_device structure.

This addition allows LSMs to make access control decisions on the integrity
data stored within the block_device, enabling more flexible security
policies. For instance, LSMs can now revoke access to dm-verity devices
based on their roothashes, ensuring that only authorized and verified
content is accessible. Additionally, LSMs can enforce policies to only
allow files from dm-verity devices that have a valid digital signature to
execute, effectively blocking any unsigned files from execution, thus
enhancing security against unauthorized modifications.

The patch includes new hook calls, `security_bdev_setintegrity()`, in
dm-verity to expose the dm-verity roothash and the roothash signature to
LSMs via preresume() callback. By using the preresume() callback, it
ensures that the security metadata is consistently in sync with the
metadata of the dm-verity target in the current active mapping table.
The hook calls are depended on CONFIG_SECURITY.

Signed-off-by: Deven Bowers <deven.desai@linux.microsoft.com>
Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
[PM: moved sig_size field as discussed]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index d7cab2d5002f..e383022467db 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -83,8 +83,15 @@ enum lsm_event {
 	LSM_POLICY_CHANGE,
 };
 
+struct dm_verity_digest {
+	const char *alg;
+	const u8 *digest;
+	size_t digest_len;
+};
+
 enum lsm_integrity_type {
-	__LSM_INT_MAX
+	LSM_INT_DMVERITY_SIG_VALID,
+	LSM_INT_DMVERITY_ROOTHASH,
 };
 
 /*
-- 
cgit v1.2.3


From fb55e177d5936fb80fb2586036d195c57e7f6892 Mon Sep 17 00:00:00 2001
From: Fan Wu <wufan@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:28 -0700
Subject: lsm: add security_inode_setintegrity() hook

This patch introduces a new hook to save inode's integrity
data. For example, for fsverity enabled files, LSMs can use this hook to
save the existence of verified fsverity builtin signature into the inode's
security blob, and LSMs can make access decisions based on this data.

Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
[PM: subject line tweak, removed changelog]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  2 ++
 include/linux/security.h      | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 860821f3bf6f..1d59513bf230 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -180,6 +180,8 @@ LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
 LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
 LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
 	 const char *name)
+LSM_HOOK(int, 0, inode_setintegrity, const struct inode *inode,
+	 enum lsm_integrity_type type, const void *value, size_t size)
 LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir,
 	 struct kernfs_node *kn)
 LSM_HOOK(int, 0, file_permission, struct file *file, int mask)
diff --git a/include/linux/security.h b/include/linux/security.h
index e383022467db..97b7c57e6560 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -410,6 +410,9 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer
 void security_inode_getsecid(struct inode *inode, u32 *secid);
 int security_inode_copy_up(struct dentry *src, struct cred **new);
 int security_inode_copy_up_xattr(struct dentry *src, const char *name);
+int security_inode_setintegrity(const struct inode *inode,
+				enum lsm_integrity_type type, const void *value,
+				size_t size);
 int security_kernfs_init_security(struct kernfs_node *kn_dir,
 				  struct kernfs_node *kn);
 int security_file_permission(struct file *file, int mask);
@@ -1026,6 +1029,13 @@ static inline int security_inode_copy_up(struct dentry *src, struct cred **new)
 	return 0;
 }
 
+static inline int security_inode_setintegrity(const struct inode *inode,
+					      enum lsm_integrity_type type,
+					      const void *value, size_t size)
+{
+	return 0;
+}
+
 static inline int security_kernfs_init_security(struct kernfs_node *kn_dir,
 						struct kernfs_node *kn)
 {
-- 
cgit v1.2.3


From 7c373e4f1445263728d3eeab7e33e932c8f4a288 Mon Sep 17 00:00:00 2001
From: Fan Wu <wufan@linux.microsoft.com>
Date: Fri, 2 Aug 2024 23:08:29 -0700
Subject: fsverity: expose verified fsverity built-in signatures to LSMs

This patch enhances fsverity's capabilities to support both integrity and
authenticity protection by introducing the exposure of built-in
signatures through a new LSM hook. This functionality allows LSMs,
e.g. IPE, to enforce policies based on the authenticity and integrity of
files, specifically focusing on built-in fsverity signatures. It enables
a policy enforcement layer within LSMs for fsverity, offering granular
control over the usage of authenticity claims. For instance, a policy
could be established to only permit the execution of all files with
verified built-in fsverity signatures.

The introduction of a security_inode_setintegrity() hook call within
fsverity's workflow ensures that the verified built-in signature of a file
is exposed to LSMs. This enables LSMs to recognize and label fsverity files
that contain a verified built-in fsverity signature. This hook is invoked
subsequent to the fsverity_verify_signature() process, guaranteeing the
signature's verification against fsverity's keyring. This mechanism is
crucial for maintaining system security, as it operates in kernel space,
effectively thwarting attempts by malicious binaries to bypass user space
stack interactions.

The second to last commit in this patch set will add a link to the IPE
documentation in fsverity.rst.

Signed-off-by: Deven Bowers <deven.desai@linux.microsoft.com>
Signed-off-by: Fan Wu <wufan@linux.microsoft.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 97b7c57e6560..c37c32ebbdcd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -92,6 +92,7 @@ struct dm_verity_digest {
 enum lsm_integrity_type {
 	LSM_INT_DMVERITY_SIG_VALID,
 	LSM_INT_DMVERITY_ROOTHASH,
+	LSM_INT_FSVERITY_BUILTINSIG_VALID,
 };
 
 /*
-- 
cgit v1.2.3


From 7cff549daa67c7549c5a8688674cea2df8c2ec80 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 16 Aug 2024 17:43:05 +0200
Subject: kernel: Add helper macros for loop unrolling

This helps in easily initializing blocks of code (e.g. static calls and
keys).

UNROLL(N, MACRO, __VA_ARGS__) calls MACRO N times with the first
argument as the index of the iteration. This allows string pasting to
create unique tokens for variable names, function calls etc.

As an example:

	#include <linux/unroll.h>

	#define MACRO(N, a, b)            \
		int add_##N(int a, int b) \
		{                         \
			return a + b + N; \
		}

	UNROLL(2, MACRO, x, y)

expands to:

	int add_0(int x, int y)
	{
		return x + y + 0;
	}

	int add_1(int x, int y)
	{
		return x + y + 1;
	}

Tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Nacked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/unroll.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/linux/unroll.h

(limited to 'include/linux')

diff --git a/include/linux/unroll.h b/include/linux/unroll.h
new file mode 100644
index 000000000000..d42fd6366373
--- /dev/null
+++ b/include/linux/unroll.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2023 Google LLC.
+ */
+
+#ifndef __UNROLL_H
+#define __UNROLL_H
+
+#include <linux/args.h>
+
+#define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args)
+
+#define __UNROLL_0(MACRO, args...)
+#define __UNROLL_1(MACRO, args...)  __UNROLL_0(MACRO, args)  MACRO(0, args)
+#define __UNROLL_2(MACRO, args...)  __UNROLL_1(MACRO, args)  MACRO(1, args)
+#define __UNROLL_3(MACRO, args...)  __UNROLL_2(MACRO, args)  MACRO(2, args)
+#define __UNROLL_4(MACRO, args...)  __UNROLL_3(MACRO, args)  MACRO(3, args)
+#define __UNROLL_5(MACRO, args...)  __UNROLL_4(MACRO, args)  MACRO(4, args)
+#define __UNROLL_6(MACRO, args...)  __UNROLL_5(MACRO, args)  MACRO(5, args)
+#define __UNROLL_7(MACRO, args...)  __UNROLL_6(MACRO, args)  MACRO(6, args)
+#define __UNROLL_8(MACRO, args...)  __UNROLL_7(MACRO, args)  MACRO(7, args)
+#define __UNROLL_9(MACRO, args...)  __UNROLL_8(MACRO, args)  MACRO(8, args)
+#define __UNROLL_10(MACRO, args...) __UNROLL_9(MACRO, args)  MACRO(9, args)
+#define __UNROLL_11(MACRO, args...) __UNROLL_10(MACRO, args) MACRO(10, args)
+#define __UNROLL_12(MACRO, args...) __UNROLL_11(MACRO, args) MACRO(11, args)
+#define __UNROLL_13(MACRO, args...) __UNROLL_12(MACRO, args) MACRO(12, args)
+#define __UNROLL_14(MACRO, args...) __UNROLL_13(MACRO, args) MACRO(13, args)
+#define __UNROLL_15(MACRO, args...) __UNROLL_14(MACRO, args) MACRO(14, args)
+#define __UNROLL_16(MACRO, args...) __UNROLL_15(MACRO, args) MACRO(15, args)
+#define __UNROLL_17(MACRO, args...) __UNROLL_16(MACRO, args) MACRO(16, args)
+#define __UNROLL_18(MACRO, args...) __UNROLL_17(MACRO, args) MACRO(17, args)
+#define __UNROLL_19(MACRO, args...) __UNROLL_18(MACRO, args) MACRO(18, args)
+#define __UNROLL_20(MACRO, args...) __UNROLL_19(MACRO, args) MACRO(19, args)
+
+#endif /* __UNROLL_H */
-- 
cgit v1.2.3


From 9b59a85a84dc37ca4f2c54df5e06aff4c1eae5d3 Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Tue, 20 Aug 2024 12:38:08 -0700
Subject: workqueue: Don't call va_start / va_end twice

Calling va_start / va_end multiple times is undefined and causes
problems with certain compiler / platforms.

Change alloc_ordered_workqueue_lockdep_map to a macro and updated
__alloc_workqueue to take a va_list argument.

Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 04dff07a9e2b..f3cc15940b4d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -543,20 +543,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-__printf(1, 4) static inline struct workqueue_struct *
-alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags,
-				    struct lockdep_map *lockdep_map, ...)
-{
-	struct workqueue_struct *wq;
-	va_list args;
-
-	va_start(args, lockdep_map);
-	wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags,
-					 1, lockdep_map, args);
-	va_end(args);
-
-	return wq;
-}
+#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...)	\
+	alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, lockdep_map, ##args)
 #endif
 
 /**
-- 
cgit v1.2.3


From 8fb9f31984bdc0aaa1b0f7c7e7a27bd3137f8744 Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Thu, 1 Aug 2024 09:33:51 +0800
Subject: f2fs: clean up val{>>,<<}F2FS_BLKSIZE_BITS

Use F2FS_BYTES_TO_BLK(bytes) and F2FS_BLK_TO_BYTES(blk) for cleanup

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 01bee2b289c2..f17f89a259e2 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -19,7 +19,6 @@
 #define F2FS_BLKSIZE_BITS		PAGE_SHIFT /* bits for F2FS_BLKSIZE */
 #define F2FS_MAX_EXTENSION		64	/* # of extension entries */
 #define F2FS_EXTENSION_LEN		8	/* max size of extension */
-#define F2FS_BLK_ALIGN(x)	(((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS)
 
 #define NULL_ADDR		((block_t)0)	/* used as block_t addresses */
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
@@ -28,6 +27,7 @@
 #define F2FS_BYTES_TO_BLK(bytes)	((bytes) >> F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_TO_BYTES(blk)		((blk) << F2FS_BLKSIZE_BITS)
 #define F2FS_BLK_END_BYTES(blk)		(F2FS_BLK_TO_BYTES(blk + 1) - 1)
+#define F2FS_BLK_ALIGN(x)			(F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1))
 
 /* 0, 1(node nid), 2(meta nid) are reserved node id */
 #define F2FS_RESERVED_NODE_NUM		3
-- 
cgit v1.2.3


From 013f510d1fce563f37f420bf231b065885a16f21 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 1 Aug 2024 21:31:21 +0800
Subject: dm: Remove unused declaration dm_get_rq_mapinfo()

Commit ae6ad75e5c3c ("dm: remove unused dm_get_rq_mapinfo()")
removed the implementation but leave declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
 include/linux/device-mapper.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 53ca3a913d06..8321f65897f3 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -524,7 +524,6 @@ int dm_post_suspending(struct dm_target *ti);
 int dm_noflush_suspending(struct dm_target *ti);
 void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors);
 void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone);
-union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 #ifdef CONFIG_BLK_DEV_ZONED
 struct dm_report_zones_args {
-- 
cgit v1.2.3


From 0e1d5731d3c1e2214249ef36dcd13ad51ad304cf Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:30 +0206
Subject: printk: Check printk_deferred_enter()/_exit() usage

Add validation that printk_deferred_enter()/_exit() are called in
non-migration contexts.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-5-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index b937cefcb31c..eee8e97da681 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -161,15 +161,16 @@ int _printk(const char *fmt, ...);
  */
 __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);
 
-extern void __printk_safe_enter(void);
-extern void __printk_safe_exit(void);
+extern void __printk_deferred_enter(void);
+extern void __printk_deferred_exit(void);
+
 /*
  * The printk_deferred_enter/exit macros are available only as a hack for
  * some code paths that need to defer all printk console printing. Interrupts
  * must be disabled for the deferred duration.
  */
-#define printk_deferred_enter __printk_safe_enter
-#define printk_deferred_exit __printk_safe_exit
+#define printk_deferred_enter() __printk_deferred_enter()
+#define printk_deferred_exit() __printk_deferred_exit()
 
 /*
  * Please don't use printk_ratelimit(), because it shares ratelimiting state
-- 
cgit v1.2.3


From b7049d88c1763d5f133b0e93e39da8e89a9f944b Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:32 +0206
Subject: printk: nbcon: Remove return value for write_atomic()

The return value of write_atomic() does not provide any useful
information. On the contrary, it makes things more complicated
for the caller to appropriately deal with the information.

Change write_atomic() to not have a return value. If the
message did not get printed due to loss of ownership, the
caller will notice this on its own. If ownership was not lost,
it will be assumed that the driver successfully printed the
message and the sequence number for that console will be
incremented.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-7-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 31a8f5b85f5d..577b157fe4fb 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -345,7 +345,7 @@ struct console {
 	struct hlist_node	node;
 
 	/* nbcon console specific members */
-	bool			(*write_atomic)(struct console *con,
+	void			(*write_atomic)(struct console *con,
 						struct nbcon_write_context *wctxt);
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
-- 
cgit v1.2.3


From 7d56936c1e5208b5eeea2a9a2f70e85248f6da49 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:33 +0206
Subject: printk: nbcon: Add detailed doc for write_atomic()

The write_atomic() callback has special requirements and is
allowed to use special helper functions. Provide detailed
documentation of the callback so that a developer has a
chance of implementing it correctly.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-8-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 577b157fe4fb..35c64ee3827b 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -303,7 +303,7 @@ struct nbcon_write_context {
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
- * @write:		Write callback to output messages (Optional)
+ * @write:		Legacy write callback to output messages (Optional)
  * @read:		Read callback for console input (Optional)
  * @device:		The underlying TTY device driver (Optional)
  * @unblank:		Callback to unblank the console (Optional)
@@ -320,7 +320,6 @@ struct nbcon_write_context {
  * @data:		Driver private data
  * @node:		hlist node for the console list
  *
- * @write_atomic:	Write callback for atomic context
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
@@ -345,8 +344,34 @@ struct console {
 	struct hlist_node	node;
 
 	/* nbcon console specific members */
-	void			(*write_atomic)(struct console *con,
-						struct nbcon_write_context *wctxt);
+
+	/**
+	 * @write_atomic:
+	 *
+	 * NBCON callback to write out text in any context. (Optional)
+	 *
+	 * This callback is called with the console already acquired. However,
+	 * a higher priority context is allowed to take it over by default.
+	 *
+	 * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe()
+	 * around any code where the takeover is not safe, for example, when
+	 * manipulating the serial port registers.
+	 *
+	 * nbcon_enter_unsafe() will fail if the context has lost the console
+	 * ownership in the meantime. In this case, the callback is no longer
+	 * allowed to go forward. It must back out immediately and carefully.
+	 * The buffer content is also no longer trusted since it no longer
+	 * belongs to the context.
+	 *
+	 * The callback should allow the takeover whenever it is safe. It
+	 * increases the chance to see messages when the system is in trouble.
+	 *
+	 * The callback can be called from any context (including NMI).
+	 * Therefore it must avoid usage of any locking and instead rely
+	 * on the console ownership for synchronization.
+	 */
+	void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt);
+
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
-- 
cgit v1.2.3


From 7a16a771890746b4060333d665ebe674945a3cfa Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:34 +0206
Subject: printk: nbcon: Add callbacks to synchronize with driver

Console drivers typically must deal with access to the hardware
via user input/output (such as an interactive login shell) and
output of kernel messages via printk() calls. To provide the
necessary synchronization, usually some driver-specific locking
mechanism is used (for example, the port spinlock for uart
serial consoles).

Until now, usage of this driver-specific locking has been hidden
from the printk-subsystem and implemented within the various
console callbacks. However, nbcon consoles would need to use it
even in the generic code.

Add device_lock() and device_unlock() callback which will need
to get implemented by nbcon consoles.

The callbacks will use whatever synchronization mechanism the
driver is using for itself. The minimum requirement is to
prevent CPU migration. It would allow a context friendly
acquiring of nbcon console ownership in non-emergency and
non-panic context.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-9-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 35c64ee3827b..46b3c210b931 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -372,6 +372,49 @@ struct console {
 	 */
 	void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt);
 
+	/**
+	 * @device_lock:
+	 *
+	 * NBCON callback to begin synchronization with driver code.
+	 *
+	 * Console drivers typically must deal with access to the hardware
+	 * via user input/output (such as an interactive login shell) and
+	 * output of kernel messages via printk() calls. This callback is
+	 * called by the printk-subsystem whenever it needs to synchronize
+	 * with hardware access by the driver. It should be implemented to
+	 * use whatever synchronization mechanism the driver is using for
+	 * itself (for example, the port lock for uart serial consoles).
+	 *
+	 * The callback is always called from task context. It may use any
+	 * synchronization method required by the driver.
+	 *
+	 * IMPORTANT: The callback MUST disable migration. The console driver
+	 *	may be using a synchronization mechanism that already takes
+	 *	care of this (such as spinlocks). Otherwise this function must
+	 *	explicitly call migrate_disable().
+	 *
+	 * The flags argument is provided as a convenience to the driver. It
+	 * will be passed again to device_unlock(). It can be ignored if the
+	 * driver does not need it.
+	 */
+	void (*device_lock)(struct console *con, unsigned long *flags);
+
+	/**
+	 * @device_unlock:
+	 *
+	 * NBCON callback to finish synchronization with driver code.
+	 *
+	 * It is the counterpart to device_lock().
+	 *
+	 * This callback is always called from task context. It must
+	 * appropriately re-enable migration (depending on how device_lock()
+	 * disabled migration).
+	 *
+	 * The flags argument is the value of the same variable that was
+	 * passed to device_lock().
+	 */
+	void (*device_unlock)(struct console *con, unsigned long flags);
+
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
-- 
cgit v1.2.3


From 77e73c0687f5bc884fa1b0cb97aca912bcc01957 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:36 +0206
Subject: serial: core: Provide low-level functions to lock port

It will be necessary at times for the uart nbcon console
drivers to acquire the port lock directly (without the
additional nbcon functionality of the port lock wrappers).
These are special cases such as the implementation of the
device_lock()/device_unlock() callbacks or for internal
port lock wrapper synchronization.

Provide low-level variants __uart_port_lock_irqsave() and
__uart_port_unlock_irqrestore() for this purpose.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240820063001.36405-11-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/serial_core.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index aea25eef9a1a..8872cd21e70a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -590,6 +590,24 @@ struct uart_port {
 	void			*private_data;		/* generic platform data pointer */
 };
 
+/*
+ * Only for console->device_lock()/_unlock() callbacks and internal
+ * port lock wrapper synchronization.
+ */
+static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
+{
+	spin_lock_irqsave(&up->lock, *flags);
+}
+
+/*
+ * Only for console->device_lock()/_unlock() callbacks and internal
+ * port lock wrapper synchronization.
+ */
+static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
+{
+	spin_unlock_irqrestore(&up->lock, flags);
+}
+
 /**
  * uart_port_lock - Lock the UART port
  * @up:		Pointer to UART port structure
-- 
cgit v1.2.3


From eabd4600dafacf9895184259373f2445ff748654 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:37 +0206
Subject: serial: core: Introduce wrapper to set @uart_port->cons
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce uart_port_set_cons() as a wrapper to set @cons of a
uart_port. The wrapper sets @cons under the port lock in order
to prevent @cons from disappearing while another context is
holding the port lock. This is necessary for a follow-up
commit relating to the port lock wrappers, which rely on @cons
not changing between lock and unlock.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Tested-by: Théo Lebrun <theo.lebrun@bootlin.com> # EyeQ5, AMBA-PL011
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20240820063001.36405-12-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/serial_core.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 8872cd21e70a..2cf03ff2056a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -608,6 +608,23 @@ static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned
 	spin_unlock_irqrestore(&up->lock, flags);
 }
 
+/**
+ * uart_port_set_cons - Safely set the @cons field for a uart
+ * @up:		The uart port to set
+ * @con:	The new console to set to
+ *
+ * This function must be used to set @up->cons. It uses the port lock to
+ * synchronize with the port lock wrappers in order to ensure that the console
+ * cannot change or disappear while another context is holding the port lock.
+ */
+static inline void uart_port_set_cons(struct uart_port *up, struct console *con)
+{
+	unsigned long flags;
+
+	__uart_port_lock_irqsave(up, &flags);
+	up->cons = con;
+	__uart_port_unlock_irqrestore(up, flags);
+}
 /**
  * uart_port_lock - Lock the UART port
  * @up:		Pointer to UART port structure
-- 
cgit v1.2.3


From dc219d8d858d95549543aa7a41b6e64a0da9e406 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:38 +0206
Subject: console: Improve console_srcu_read_flags() comments

It was not clear when exactly console_srcu_read_flags() must be
used vs. directly reading @console->flags.

Refactor and clarify that console_srcu_read_flags() is only
needed if the console is registered or the caller is in a
context where the registration status of the console may change
(due to another context).

The function requires the caller holds @console_srcu, which will
ensure that the caller sees an appropriate @flags value for the
registered console and that exit/cleanup routines will not run
if the console is in the process of unregistration.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-13-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 46b3c210b931..aafe3121b74e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -446,28 +446,34 @@ extern void console_list_unlock(void) __releases(console_mutex);
 extern struct hlist_head console_list;
 
 /**
- * console_srcu_read_flags - Locklessly read the console flags
+ * console_srcu_read_flags - Locklessly read flags of a possibly registered
+ *				console
  * @con:	struct console pointer of console to read flags from
  *
- * This function provides the necessary READ_ONCE() and data_race()
- * notation for locklessly reading the console flags. The READ_ONCE()
- * in this function matches the WRITE_ONCE() when @flags are modified
- * for registered consoles with console_srcu_write_flags().
+ * Locklessly reading @con->flags provides a consistent read value because
+ * there is at most one CPU modifying @con->flags and that CPU is using only
+ * read-modify-write operations to do so.
  *
- * Only use this function to read console flags when locklessly
- * iterating the console list via srcu.
+ * Requires console_srcu_read_lock to be held, which implies that @con might
+ * be a registered console. The purpose of holding console_srcu_read_lock is
+ * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED)
+ * and that no exit/cleanup routines will run if the console is currently
+ * undergoing unregistration.
+ *
+ * If the caller is holding the console_list_lock or it is _certain_ that
+ * @con is not and will not become registered, the caller may read
+ * @con->flags directly instead.
  *
  * Context: Any context.
+ * Return: The current value of the @con->flags field.
  */
 static inline short console_srcu_read_flags(const struct console *con)
 {
 	WARN_ON_ONCE(!console_srcu_read_lock_is_held());
 
 	/*
-	 * Locklessly reading console->flags provides a consistent
-	 * read value because there is at most one CPU modifying
-	 * console->flags and that CPU is using only read-modify-write
-	 * operations to do so.
+	 * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified
+	 * for registered consoles with console_srcu_write_flags().
 	 */
 	return data_race(READ_ONCE(con->flags));
 }
-- 
cgit v1.2.3


From adf6f37d142e14896115929f3892bbc0a86e25bf Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:39 +0206
Subject: nbcon: Add API to acquire context for non-printing operations

Provide functions nbcon_device_try_acquire() and
nbcon_device_release() which will try to acquire the nbcon
console ownership with NBCON_PRIO_NORMAL and mark it unsafe for
handover/takeover.

These functions are to be used together with the device-specific
locking when performing non-printing activities on the console
device. They will allow synchronization against the
atomic_write() callback which will be serialized, for higher
priority contexts, only by acquiring the console context
ownership.

Pitfalls:

The API requires to be called in a context with migration
disabled because it uses per-CPU variables internally.

The context is set unsafe for a takeover all the time. It
guarantees full serialization against any atomic_write() caller
except for the final flush in panic() which might try an unsafe
takeover.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-14-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h |  2 ++
 include/linux/printk.h  | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index aafe3121b74e..3706f944de46 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -322,6 +322,7 @@ struct nbcon_write_context {
  *
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
+ * @nbcon_device_ctxt:	Context available for non-printing operations
  * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
@@ -417,6 +418,7 @@ struct console {
 
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
+	struct nbcon_context	__private nbcon_device_ctxt;
 	struct printk_buffers	*pbufs;
 };
 
diff --git a/include/linux/printk.h b/include/linux/printk.h
index eee8e97da681..9687089f5ace 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -9,6 +9,8 @@
 #include <linux/ratelimit_types.h>
 #include <linux/once_lite.h>
 
+struct console;
+
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
 
@@ -198,6 +200,8 @@ extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
 void printk_trigger_flush(void);
 void console_try_replay_all(void);
+extern bool nbcon_device_try_acquire(struct console *con);
+extern void nbcon_device_release(struct console *con);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -280,6 +284,16 @@ static inline void printk_trigger_flush(void)
 static inline void console_try_replay_all(void)
 {
 }
+
+static inline bool nbcon_device_try_acquire(struct console *con)
+{
+	return false;
+}
+
+static inline void nbcon_device_release(struct console *con)
+{
+}
+
 #endif
 
 bool this_cpu_in_panic(void);
-- 
cgit v1.2.3


From 4126f149c48b2ae757d11ea24675f7071ab22ebf Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:40 +0206
Subject: serial: core: Acquire nbcon context in port->lock wrapper

Currently the port->lock wrappers uart_port_lock(),
uart_port_unlock() (and their variants) only lock/unlock
the spin_lock.

If the port is an nbcon console that has implemented the
write_atomic() callback, the wrappers must also acquire/release
the console context and mark the region as unsafe. This allows
general port->lock synchronization to be synchronized against
the nbcon write_atomic() callback.

Note that __uart_port_using_nbcon() relies on the port->lock
being held while a console is added and removed from the
console list (i.e. all uart nbcon drivers *must* take the
port->lock in their device_lock() callbacks).

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-15-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/serial_core.h | 82 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 2cf03ff2056a..4ab65874a850 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -11,6 +11,8 @@
 #include <linux/compiler.h>
 #include <linux/console.h>
 #include <linux/interrupt.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/tty.h>
@@ -625,6 +627,60 @@ static inline void uart_port_set_cons(struct uart_port *up, struct console *con)
 	up->cons = con;
 	__uart_port_unlock_irqrestore(up, flags);
 }
+
+/* Only for internal port lock wrapper usage. */
+static inline bool __uart_port_using_nbcon(struct uart_port *up)
+{
+	lockdep_assert_held_once(&up->lock);
+
+	if (likely(!uart_console(up)))
+		return false;
+
+	/*
+	 * @up->cons is only modified under the port lock. Therefore it is
+	 * certain that it cannot disappear here.
+	 *
+	 * @up->cons->node is added/removed from the console list under the
+	 * port lock. Therefore it is certain that the registration status
+	 * cannot change here, thus @up->cons->flags can be read directly.
+	 */
+	if (hlist_unhashed_lockless(&up->cons->node) ||
+	    !(up->cons->flags & CON_NBCON) ||
+	    !up->cons->write_atomic) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Only for internal port lock wrapper usage. */
+static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up)
+{
+	if (!__uart_port_using_nbcon(up))
+		return true;
+
+	return nbcon_device_try_acquire(up->cons);
+}
+
+/* Only for internal port lock wrapper usage. */
+static inline void __uart_port_nbcon_acquire(struct uart_port *up)
+{
+	if (!__uart_port_using_nbcon(up))
+		return;
+
+	while (!nbcon_device_try_acquire(up->cons))
+		cpu_relax();
+}
+
+/* Only for internal port lock wrapper usage. */
+static inline void __uart_port_nbcon_release(struct uart_port *up)
+{
+	if (!__uart_port_using_nbcon(up))
+		return;
+
+	nbcon_device_release(up->cons);
+}
+
 /**
  * uart_port_lock - Lock the UART port
  * @up:		Pointer to UART port structure
@@ -632,6 +688,7 @@ static inline void uart_port_set_cons(struct uart_port *up, struct console *con)
 static inline void uart_port_lock(struct uart_port *up)
 {
 	spin_lock(&up->lock);
+	__uart_port_nbcon_acquire(up);
 }
 
 /**
@@ -641,6 +698,7 @@ static inline void uart_port_lock(struct uart_port *up)
 static inline void uart_port_lock_irq(struct uart_port *up)
 {
 	spin_lock_irq(&up->lock);
+	__uart_port_nbcon_acquire(up);
 }
 
 /**
@@ -651,6 +709,7 @@ static inline void uart_port_lock_irq(struct uart_port *up)
 static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
 {
 	spin_lock_irqsave(&up->lock, *flags);
+	__uart_port_nbcon_acquire(up);
 }
 
 /**
@@ -661,7 +720,15 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f
  */
 static inline bool uart_port_trylock(struct uart_port *up)
 {
-	return spin_trylock(&up->lock);
+	if (!spin_trylock(&up->lock))
+		return false;
+
+	if (!__uart_port_nbcon_try_acquire(up)) {
+		spin_unlock(&up->lock);
+		return false;
+	}
+
+	return true;
 }
 
 /**
@@ -673,7 +740,15 @@ static inline bool uart_port_trylock(struct uart_port *up)
  */
 static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
 {
-	return spin_trylock_irqsave(&up->lock, *flags);
+	if (!spin_trylock_irqsave(&up->lock, *flags))
+		return false;
+
+	if (!__uart_port_nbcon_try_acquire(up)) {
+		spin_unlock_irqrestore(&up->lock, *flags);
+		return false;
+	}
+
+	return true;
 }
 
 /**
@@ -682,6 +757,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long
  */
 static inline void uart_port_unlock(struct uart_port *up)
 {
+	__uart_port_nbcon_release(up);
 	spin_unlock(&up->lock);
 }
 
@@ -691,6 +767,7 @@ static inline void uart_port_unlock(struct uart_port *up)
  */
 static inline void uart_port_unlock_irq(struct uart_port *up)
 {
+	__uart_port_nbcon_release(up);
 	spin_unlock_irq(&up->lock);
 }
 
@@ -701,6 +778,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up)
  */
 static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
 {
+	__uart_port_nbcon_release(up);
 	spin_unlock_irqrestore(&up->lock, flags);
 }
 
-- 
cgit v1.2.3


From 5dde3b7354133846079fa51e55e74ef90a836759 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:52 +0206
Subject: printk: nbcon: Add unsafe flushing on panic

Add nbcon_atomic_flush_unsafe() to flush all nbcon consoles
using the write_atomic() callback and allowing unsafe hostile
takeovers. Call this at the end of panic() as a final attempt
to flush any pending messages.

Note that legacy consoles use unsafe methods for flushing
from the beginning of panic (see bust_spinlocks()). Therefore,
systems using both legacy and nbcon consoles may still fail to
see panic messages due to unsafe legacy console usage.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-27-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9687089f5ace..2e083f01f8a3 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -202,6 +202,7 @@ void printk_trigger_flush(void);
 void console_try_replay_all(void);
 extern bool nbcon_device_try_acquire(struct console *con);
 extern void nbcon_device_release(struct console *con);
+void nbcon_atomic_flush_unsafe(void);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -294,6 +295,10 @@ static inline void nbcon_device_release(struct console *con)
 {
 }
 
+static inline void nbcon_atomic_flush_unsafe(void)
+{
+}
+
 #endif
 
 bool this_cpu_in_panic(void);
-- 
cgit v1.2.3


From e35a8884270bae11196eedf3b0a5bf22619f11f2 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:55 +0206
Subject: printk: Coordinate direct printing in panic

If legacy and nbcon consoles are registered and the nbcon
consoles are allowed to flush (i.e. no boot consoles
registered), the legacy consoles will no longer perform
direct printing on the panic CPU until after the backtrace
has been stored. This will give the safe nbcon consoles a
chance to print the panic messages before allowing the
unsafe legacy consoles to print.

If no nbcon consoles are registered or they are not allowed
to flush because boot consoles are registered, there is no
change in behavior (i.e. legacy consoles will always attempt
to print from the printk() caller context).

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-30-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 2e083f01f8a3..eca9bb2ee637 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -200,6 +200,7 @@ extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
 void printk_trigger_flush(void);
 void console_try_replay_all(void);
+void printk_legacy_allow_panic_sync(void);
 extern bool nbcon_device_try_acquire(struct console *con);
 extern void nbcon_device_release(struct console *con);
 void nbcon_atomic_flush_unsafe(void);
@@ -286,6 +287,10 @@ static inline void console_try_replay_all(void)
 {
 }
 
+static inline void printk_legacy_allow_panic_sync(void)
+{
+}
+
 static inline bool nbcon_device_try_acquire(struct console *con)
 {
 	return false;
-- 
cgit v1.2.3


From ecb5e1aa82c86642ec1eaafefd4e317dfba3a238 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Aug 2024 08:35:57 +0206
Subject: printk: nbcon: Implement emergency sections

In emergency situations (something has gone wrong but the
system continues to operate), usually important information
(such as a backtrace) is generated via printk(). This
information should be pushed out to the consoles ASAP.

Add per-CPU emergency nesting tracking because an emergency
can arise while in an emergency situation.

Add functions to mark the beginning and end of emergency
sections where the urgent messages are generated.

Perform direct console flushing at the emergency priority if
the current CPU is in an emergency state and it is safe to do
so.

Note that the emergency state is not system-wide. While one CPU
is in an emergency state, another CPU may attempt to print
console messages at normal priority.

Also note that printk() already attempts to flush consoles in
the caller context for normal priority. However, follow-up
changes will introduce printing kthreads, in which case the
normal priority printk() calls will offload to the kthreads.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240820063001.36405-32-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 3706f944de46..9a13f91b0c43 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -553,10 +553,14 @@ static inline bool console_is_registered(const struct console *con)
 	hlist_for_each_entry(con, &console_list, node)
 
 #ifdef CONFIG_PRINTK
+extern void nbcon_cpu_emergency_enter(void);
+extern void nbcon_cpu_emergency_exit(void);
 extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
 #else
+static inline void nbcon_cpu_emergency_enter(void) { }
+static inline void nbcon_cpu_emergency_exit(void) { }
 static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
-- 
cgit v1.2.3


From d156263e247c334a8872578118e0709ed63c4806 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 21 Aug 2024 06:37:39 -1000
Subject: workqueue: Fix another htmldocs build warning

Fix htmldocs build warning introduced by 9b59a85a84dc ("workqueue: Don't
call va_start / va_end twice").

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matthew Brost <matthew.brost@intel.com>
---
 include/linux/workqueue.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f3cc15940b4d..59c2695e12e7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -534,7 +534,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
  * @lockdep_map: user-defined lockdep_map
- * @...: args for @fmt
+ * @args: args for @fmt
  *
  * Same as alloc_ordered_workqueue but with the a user-define lockdep_map.
  * Useful for workqueues created with the same purpose and to avoid leaking a
@@ -544,7 +544,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
 #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...)	\
-	alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, lockdep_map, ##args)
+	alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),	\
+				    1, lockdep_map, ##args)
 #endif
 
 /**
-- 
cgit v1.2.3


From 496ddd19a0fad22f250fc7a7b7a8000155418934 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 8 Aug 2024 16:22:28 -0700
Subject: bpf: extract iterator argument type and name validation logic

Verifier enforces that all iterator structs are named `bpf_iter_<name>`
and that whenever iterator is passed to a kfunc it's passed as a valid PTR ->
STRUCT chain (with potentially const modifiers in between).

We'll need this check for upcoming changes, so instead of duplicating
the logic, extract it into a helper function.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240808232230.2848712-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index cffb43133c68..b8a583194c4a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -580,6 +580,7 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type);
 bool btf_types_are_same(const struct btf *btf1, u32 id1,
 			const struct btf *btf2, u32 id2);
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx);
 #else
 static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
 						    u32 type_id)
@@ -654,6 +655,10 @@ static inline bool btf_types_are_same(const struct btf *btf1, u32 id1,
 {
 	return false;
 }
+static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
-- 
cgit v1.2.3


From 74b1e94e94ea369923e5656eeb10b26b16591327 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 20 Aug 2024 13:51:19 -0700
Subject: net: repack struct netdev_queue

Adding the NAPI pointer to struct netdev_queue made it grow into another
cacheline, even though there was 44 bytes of padding available.

The struct was historically grouped as follows:

    /* read-mostly stuff (align) */
    /* ... random control path fields ... */
    /* write-mostly stuff (align) */
    /* ... 40 byte hole ... */
    /* struct dql (align) */

It seems that people want to add control path fields after
the read only fields. struct dql looks pretty innocent
but it forces its own alignment and nothing indicates that
there is a lot of empty space above it.

Move dql above the xmit_lock. This shifts the empty space
to the end of the struct rather than in the middle of it.
Move two example fields there to set an example.
Hopefully people will now add new fields at the end of
the struct. A lot of the read-only stuff is also control
path-only, but if we move it all we'll have another hole
in the middle.

Before:
	/* size: 384, cachelines: 6, members: 16 */
	/* sum members: 284, holes: 3, sum holes: 100 */

After:
        /* size: 320, cachelines: 5, members: 16 */
        /* sum members: 284, holes: 1, sum holes: 8 */

Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20240820205119.1321322-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ef3eaa23f4b..614ec5d3d75b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -644,9 +644,6 @@ struct netdev_queue {
 	struct Qdisc __rcu	*qdisc_sleeping;
 #ifdef CONFIG_SYSFS
 	struct kobject		kobj;
-#endif
-#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
-	int			numa_node;
 #endif
 	unsigned long		tx_maxrate;
 	/*
@@ -660,13 +657,13 @@ struct netdev_queue {
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool    *pool;
 #endif
-	/* NAPI instance for the queue
-	 * Readers and writers must hold RTNL
-	 */
-	struct napi_struct      *napi;
+
 /*
  * write-mostly part
  */
+#ifdef CONFIG_BQL
+	struct dql		dql;
+#endif
 	spinlock_t		_xmit_lock ____cacheline_aligned_in_smp;
 	int			xmit_lock_owner;
 	/*
@@ -676,8 +673,16 @@ struct netdev_queue {
 
 	unsigned long		state;
 
-#ifdef CONFIG_BQL
-	struct dql		dql;
+/*
+ * slow- / control-path part
+ */
+	/* NAPI instance for the queue
+	 * Readers and writers must hold RTNL
+	 */
+	struct napi_struct	*napi;
+
+#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
+	int			numa_node;
 #endif
 } ____cacheline_aligned_in_smp;
 
-- 
cgit v1.2.3


From ba0fb44aed47693cc2482427f63ba6cd19051327 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Sun, 11 Aug 2024 10:09:35 +0300
Subject: dma-mapping: replace zone_dma_bits by zone_dma_limit

The hardware DMA limit might not be power of 2. When RAM range starts
above 0, say 4GB, DMA limit of 30 bits should end at 5GB.  A single high
bit can not encode this limit.

Use a plain  address for the DMA zone limit instead.

Since the DMA zone can now potentially span beyond 4GB physical limit of
DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Co-developed-by: Baruch Siach <baruch@tkos.co.il>
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Petr Tesarik <ptesarik@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-direct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index edbe13d00776..d7e30d4f7503 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -12,7 +12,7 @@
 #include <linux/mem_encrypt.h>
 #include <linux/swiotlb.h>
 
-extern unsigned int zone_dma_bits;
+extern u64 zone_dma_limit;
 
 /*
  * Record the mapping of CPU physical to DMA addresses for a given region.
-- 
cgit v1.2.3


From b5c58b2fdc427e7958412ecb2de2804a1f7c1572 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 24 Jul 2024 21:04:49 +0300
Subject: dma-mapping: direct calls for dma-iommu

Directly call into dma-iommu just like we have been doing for dma-direct
for a while.  This avoids the indirect call overhead for IOMMU ops and
removes the need to have DMA ops entirely for many common configurations.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/device.h      |   5 ++
 include/linux/dma-map-ops.h |  13 ----
 include/linux/iommu-dma.h   | 147 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/iommu-dma.h

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 34eb20f5966f..1c5280d28bc3 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -707,6 +707,8 @@ struct device_physical_location {
  *		for dma allocations.  This flag is managed by the dma ops
  *		instance from ->dma_supported.
  * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
+ * @dma_iommu: Device is using default IOMMU implementation for DMA and
+ *		doesn't rely on dma_ops structure.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -822,6 +824,9 @@ struct device {
 #ifdef CONFIG_DMA_NEED_SYNC
 	bool			dma_skip_sync:1;
 #endif
+#ifdef CONFIG_IOMMU_DMA
+	bool			dma_iommu:1;
+#endif
 };
 
 /**
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 02a1c825896b..077b15c93bb8 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -13,20 +13,7 @@
 struct cma;
 struct iommu_ops;
 
-/*
- * Values for struct dma_map_ops.flags:
- *
- * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can
- * handle PCI P2PDMA pages in the map_sg/unmap_sg operation.
- * DMA_F_CAN_SKIP_SYNC: DMA sync operations can be skipped if the device is
- * coherent and it's not an SWIOTLB buffer.
- */
-#define DMA_F_PCI_P2PDMA_SUPPORTED     (1 << 0)
-#define DMA_F_CAN_SKIP_SYNC            (1 << 1)
-
 struct dma_map_ops {
-	unsigned int flags;
-
 	void *(*alloc)(struct device *dev, size_t size,
 			dma_addr_t *dma_handle, gfp_t gfp,
 			unsigned long attrs);
diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h
new file mode 100644
index 000000000000..d30a58bf00fd
--- /dev/null
+++ b/include/linux/iommu-dma.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ *
+ * DMA operations that map physical memory through IOMMU.
+ */
+#ifndef _LINUX_IOMMU_DMA_H
+#define _LINUX_IOMMU_DMA_H
+
+#include <linux/dma-direction.h>
+
+#ifdef CONFIG_IOMMU_DMA
+dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
+		gfp_t gfp, unsigned long attrs);
+int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs);
+int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs);
+unsigned long iommu_dma_get_merge_boundary(struct device *dev);
+size_t iommu_dma_opt_mapping_size(void);
+size_t iommu_dma_max_mapping_size(struct device *dev);
+void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t handle, unsigned long attrs);
+dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size,
+		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
+void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir);
+void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction dir);
+void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction dir);
+void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+		int nelems, enum dma_data_direction dir);
+void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+		int nelems, enum dma_data_direction dir);
+#else
+static inline dma_addr_t iommu_dma_map_page(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	return DMA_MAPPING_ERROR;
+}
+static inline void iommu_dma_unmap_page(struct device *dev,
+		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	return -EINVAL;
+}
+static inline void iommu_dma_unmap_sg(struct device *dev,
+		struct scatterlist *sg, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline void *iommu_dma_alloc(struct device *dev, size_t size,
+		dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+{
+	return NULL;
+}
+static inline int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		unsigned long attrs)
+{
+	return -EINVAL;
+}
+static inline int iommu_dma_get_sgtable(struct device *dev,
+		struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr,
+		size_t size, unsigned long attrs)
+{
+	return -EINVAL;
+}
+static inline unsigned long iommu_dma_get_merge_boundary(struct device *dev)
+{
+	return 0;
+}
+static inline size_t iommu_dma_opt_mapping_size(void)
+{
+	return 0;
+}
+static inline size_t iommu_dma_max_mapping_size(struct device *dev)
+{
+	return 0;
+}
+static inline void iommu_dma_free(struct device *dev, size_t size,
+		void *cpu_addr, dma_addr_t handle, unsigned long attrs)
+{
+}
+static inline dma_addr_t iommu_dma_map_resource(struct device *dev,
+		phys_addr_t phys, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+	return DMA_MAPPING_ERROR;
+}
+static inline void iommu_dma_unmap_resource(struct device *dev,
+		dma_addr_t handle, size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline struct sg_table *
+iommu_dma_alloc_noncontiguous(struct device *dev, size_t size,
+		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+	return NULL;
+}
+static inline void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir)
+{
+}
+static inline void iommu_dma_sync_single_for_cpu(struct device *dev,
+		dma_addr_t dma_handle, size_t size,
+		enum dma_data_direction dir)
+{
+}
+static inline void iommu_dma_sync_single_for_device(struct device *dev,
+		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void iommu_dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nelems,
+		enum dma_data_direction dir)
+{
+}
+static inline void iommu_dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nelems,
+		enum dma_data_direction dir)
+{
+}
+#endif /* CONFIG_IOMMU_DMA */
+#endif /* _LINUX_IOMMU_DMA_H */
-- 
cgit v1.2.3


From ae010757a55b57c8b82628e8ea9b7da2269131d9 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Thu, 22 Aug 2024 01:41:07 -0700
Subject: bpf: rename nocsr -> bpf_fastcall in verifier

Attribute used by LLVM implementation of the feature had been changed
from no_caller_saved_registers to bpf_fastcall (see [1]).
This commit replaces references to nocsr by references to bpf_fastcall
to keep LLVM and Kernel parts in sync.

[1] https://github.com/llvm/llvm-project/pull/105417

Acked-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240822084112.3257995-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  6 +++---
 include/linux/bpf_verifier.h | 18 +++++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f0192c173ed8..00dc4dd28cbd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -808,12 +808,12 @@ struct bpf_func_proto {
 	bool gpl_only;
 	bool pkt_access;
 	bool might_sleep;
-	/* set to true if helper follows contract for gcc/llvm
-	 * attribute no_caller_saved_registers:
+	/* set to true if helper follows contract for llvm
+	 * attribute bpf_fastcall:
 	 * - void functions do not scratch r0
 	 * - functions taking N arguments scratch only registers r1-rN
 	 */
-	bool allow_nocsr;
+	bool allow_fastcall;
 	enum bpf_return_type ret_type;
 	union {
 		struct {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5cea15c81b8a..634a302a39e3 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -577,13 +577,13 @@ struct bpf_insn_aux_data {
 	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 	/* true if STX or LDX instruction is a part of a spill/fill
-	 * pattern for a no_caller_saved_registers call.
+	 * pattern for a bpf_fastcall call.
 	 */
-	u8 nocsr_pattern:1;
+	u8 fastcall_pattern:1;
 	/* for CALL instructions, a number of spill/fill pairs in the
-	 * no_caller_saved_registers pattern.
+	 * bpf_fastcall pattern.
 	 */
-	u8 nocsr_spills_num:3;
+	u8 fastcall_spills_num:3;
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
@@ -653,10 +653,10 @@ struct bpf_subprog_info {
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
 	u16 stack_extra;
-	/* offsets in range [stack_depth .. nocsr_stack_off)
-	 * are used for no_caller_saved_registers spills and fills.
+	/* offsets in range [stack_depth .. fastcall_stack_off)
+	 * are used for bpf_fastcall spills and fills.
 	 */
-	s16 nocsr_stack_off;
+	s16 fastcall_stack_off;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
@@ -664,8 +664,8 @@ struct bpf_subprog_info {
 	bool is_async_cb: 1;
 	bool is_exception_cb: 1;
 	bool args_cached: 1;
-	/* true if nocsr stack region is used by functions that can't be inlined */
-	bool keep_nocsr_stack: 1;
+	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
+	bool keep_fastcall_stack: 1;
 
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
-- 
cgit v1.2.3


From aa35e56a5217b86f9c05420c36c908baf3b2df5f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 19 Aug 2024 18:00:19 +0200
Subject: thermal: core: Introduce .should_bind() thermal zone callback

The current design of the code binding cooling devices to trip points in
thermal zones is convoluted and hard to follow.

Namely, a driver that registers a thermal zone can provide .bind()
and .unbind() operations for it, which are required to call either
thermal_bind_cdev_to_trip() and thermal_unbind_cdev_from_trip(),
respectively, or thermal_zone_bind_cooling_device() and
thermal_zone_unbind_cooling_device(), respectively, for every relevant
trip point and the given cooling device.  Moreover, if .bind() is
provided and .unbind() is not, the cleanup necessary during the removal
of a thermal zone or a cooling device may not be carried out.

In other words, the core relies on the thermal zone owners to do the
right thing, which is error prone and far from obvious, even though all
of that is not really necessary.  Specifically, if the core could ask
the thermal zone owner, through a special thermal zone callback, whether
or not a given cooling device should be bound to a given trip point in
the given thermal zone, it might as well carry out all of the binding
and unbinding by itself.  In particular, the unbinding can be done
automatically without involving the thermal zone owner at all because
all of the thermal instances associated with a thermal zone or cooling
device going away must be deleted regardless.

Accordingly, introduce a new thermal zone operation, .should_bind(),
that can be invoked by the thermal core for a given thermal zone,
trip point and cooling device combination in order to check whether
or not the cooling device should be bound to the trip point at hand.
It takes an additional cooling_spec argument allowing the thermal
zone owner to specify the highest and lowest cooling states of the
cooling device and its weight for the given trip point binding.

Make the thermal core use this operation, if present, in the absence of
.bind() and .unbind().  Note that .should_bind() will be called under
the thermal zone lock.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Acked-by: Huisong Li <lihuisong@huawei.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/9334403.CDJkKcVGEf@rjwysocki.net
---
 include/linux/thermal.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ba6d74c11620..9c5d1f14224f 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -85,11 +85,21 @@ struct thermal_trip {
 
 struct thermal_zone_device;
 
+struct cooling_spec {
+	unsigned long upper;	/* Highest cooling state  */
+	unsigned long lower;	/* Lowest cooling state  */
+	unsigned int weight;	/* Cooling device weight */
+};
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
 	int (*unbind) (struct thermal_zone_device *,
 		       struct thermal_cooling_device *);
+	bool (*should_bind) (struct thermal_zone_device *,
+			     const struct thermal_trip *,
+			     struct thermal_cooling_device *,
+			     struct cooling_spec *);
 	int (*get_temp) (struct thermal_zone_device *, int *);
 	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*change_mode) (struct thermal_zone_device *,
-- 
cgit v1.2.3


From 2fb85633641770d55b50c934db6b4fb02411a1fe Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 19 Aug 2024 18:05:00 +0200
Subject: thermal: core: Unexport thermal_bind_cdev_to_trip() and
 thermal_unbind_cdev_from_trip()

Since thermal_bind_cdev_to_trip() and thermal_unbind_cdev_from_trip()
are only called locally in the thermal core now, they can be static,
so change their definitions accordingly and drop their headers from
the global thermal header file.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Acked-by: Huisong Li <lihuisong@huawei.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/3512161.QJadu78ljV@rjwysocki.net
---
 include/linux/thermal.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 9c5d1f14224f..6599a26847f7 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -247,18 +247,10 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 int thermal_zone_device_id(struct thermal_zone_device *tzd);
 struct device *thermal_zone_device(struct thermal_zone_device *tzd);
 
-int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
-			      const struct thermal_trip *trip,
-			      struct thermal_cooling_device *cdev,
-			      unsigned long upper, unsigned long lower,
-			      unsigned int weight);
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
 				     unsigned long, unsigned long,
 				     unsigned int);
-int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
-				  const struct thermal_trip *trip,
-				  struct thermal_cooling_device *cdev);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
-- 
cgit v1.2.3


From d51e783c17bab0c139bf78d6bd9d1f66673f7903 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 16 Aug 2024 17:43:06 +0200
Subject: lsm: count the LSMs enabled at compile time

These macros are a clever trick to determine a count of the number of
LSMs that are enabled in the config to ascertain the maximum number of
static calls that need to be configured per LSM hook.

Without this one would need to generate static calls for the total
number of LSMs in the kernel (even if they are not compiled) times the
number of LSM hooks which ends up being quite wasteful.

Tested-by: Guenter Roeck <linux@roeck-us.net>
Suggested-by: Kui-Feng Lee <sinquersw@gmail.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Nacked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: KP Singh <kpsingh@kernel.org>
[PM: added IPE to the count during merge]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/args.h      |   6 +--
 include/linux/lsm_count.h | 135 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/lsm_count.h

(limited to 'include/linux')

diff --git a/include/linux/args.h b/include/linux/args.h
index 8ff60a54eb7d..2e8e65d975c7 100644
--- a/include/linux/args.h
+++ b/include/linux/args.h
@@ -17,9 +17,9 @@
  * that as _n.
  */
 
-/* This counts to 12. Any more, it will return 13th argument. */
-#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
-#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+/* This counts to 15. Any more, it will return 16th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
 
 /* Concatenate two parameters, but allow them to be expanded beforehand. */
 #define __CONCAT(a, b) a ## b
diff --git a/include/linux/lsm_count.h b/include/linux/lsm_count.h
new file mode 100644
index 000000000000..16eb49761b25
--- /dev/null
+++ b/include/linux/lsm_count.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (C) 2023 Google LLC.
+ */
+
+#ifndef __LINUX_LSM_COUNT_H
+#define __LINUX_LSM_COUNT_H
+
+#include <linux/args.h>
+
+#ifdef CONFIG_SECURITY
+
+/*
+ * Macros to count the number of LSMs enabled in the kernel at compile time.
+ */
+
+/*
+ * Capabilities is enabled when CONFIG_SECURITY is enabled.
+ */
+#if IS_ENABLED(CONFIG_SECURITY)
+#define CAPABILITIES_ENABLED 1,
+#else
+#define CAPABILITIES_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_SELINUX)
+#define SELINUX_ENABLED 1,
+#else
+#define SELINUX_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_SMACK)
+#define SMACK_ENABLED 1,
+#else
+#define SMACK_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_APPARMOR)
+#define APPARMOR_ENABLED 1,
+#else
+#define APPARMOR_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_TOMOYO)
+#define TOMOYO_ENABLED 1,
+#else
+#define TOMOYO_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_YAMA)
+#define YAMA_ENABLED 1,
+#else
+#define YAMA_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_LOADPIN)
+#define LOADPIN_ENABLED 1,
+#else
+#define LOADPIN_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM)
+#define LOCKDOWN_ENABLED 1,
+#else
+#define LOCKDOWN_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_SAFESETID)
+#define SAFESETID_ENABLED 1,
+#else
+#define SAFESETID_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_BPF_LSM)
+#define BPF_LSM_ENABLED 1,
+#else
+#define BPF_LSM_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_LANDLOCK)
+#define LANDLOCK_ENABLED 1,
+#else
+#define LANDLOCK_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_IMA)
+#define IMA_ENABLED 1,
+#else
+#define IMA_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_EVM)
+#define EVM_ENABLED 1,
+#else
+#define EVM_ENABLED
+#endif
+
+#if IS_ENABLED(CONFIG_SECURITY_IPE)
+#define IPE_ENABLED 1,
+#else
+#define IPE_ENABLED
+#endif
+
+/*
+ *  There is a trailing comma that we need to be accounted for. This is done by
+ *  using a skipped argument in __COUNT_LSMS
+ */
+#define __COUNT_LSMS(skipped_arg, args...) COUNT_ARGS(args...)
+#define COUNT_LSMS(args...) __COUNT_LSMS(args)
+
+#define MAX_LSM_COUNT			\
+	COUNT_LSMS(			\
+		CAPABILITIES_ENABLED	\
+		SELINUX_ENABLED		\
+		SMACK_ENABLED		\
+		APPARMOR_ENABLED	\
+		TOMOYO_ENABLED		\
+		YAMA_ENABLED		\
+		LOADPIN_ENABLED		\
+		LOCKDOWN_ENABLED	\
+		SAFESETID_ENABLED	\
+		BPF_LSM_ENABLED		\
+		LANDLOCK_ENABLED	\
+		IMA_ENABLED		\
+		EVM_ENABLED		\
+		IPE_ENABLED)
+
+#else
+
+#define MAX_LSM_COUNT 0
+
+#endif /* CONFIG_SECURITY */
+
+#endif  /* __LINUX_LSM_COUNT_H */
-- 
cgit v1.2.3


From 417c5643cd67a55f424b203b492082035d0236c3 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 16 Aug 2024 17:43:07 +0200
Subject: lsm: replace indirect LSM hook calls with static calls

LSM hooks are currently invoked from a linked list as indirect calls
which are invoked using retpolines as a mitigation for speculative
attacks (Branch History / Target injection) and add extra overhead which
is especially bad in kernel hot paths:

security_file_ioctl:
   0xff...0320 <+0>:	endbr64
   0xff...0324 <+4>:	push   %rbp
   0xff...0325 <+5>:	push   %r15
   0xff...0327 <+7>:	push   %r14
   0xff...0329 <+9>:	push   %rbx
   0xff...032a <+10>:	mov    %rdx,%rbx
   0xff...032d <+13>:	mov    %esi,%ebp
   0xff...032f <+15>:	mov    %rdi,%r14
   0xff...0332 <+18>:	mov    $0xff...7030,%r15
   0xff...0339 <+25>:	mov    (%r15),%r15
   0xff...033c <+28>:	test   %r15,%r15
   0xff...033f <+31>:	je     0xff...0358 <security_file_ioctl+56>
   0xff...0341 <+33>:	mov    0x18(%r15),%r11
   0xff...0345 <+37>:	mov    %r14,%rdi
   0xff...0348 <+40>:	mov    %ebp,%esi
   0xff...034a <+42>:	mov    %rbx,%rdx

   0xff...034d <+45>:	call   0xff...2e0 <__x86_indirect_thunk_array+352>
   			       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Indirect calls that use retpolines leading to overhead, not just due
    to extra instruction but also branch misses.

   0xff...0352 <+50>:	test   %eax,%eax
   0xff...0354 <+52>:	je     0xff...0339 <security_file_ioctl+25>
   0xff...0356 <+54>:	jmp    0xff...035a <security_file_ioctl+58>
   0xff...0358 <+56>:	xor    %eax,%eax
   0xff...035a <+58>:	pop    %rbx
   0xff...035b <+59>:	pop    %r14
   0xff...035d <+61>:	pop    %r15
   0xff...035f <+63>:	pop    %rbp
   0xff...0360 <+64>:	jmp    0xff...47c4 <__x86_return_thunk>

The indirect calls are not really needed as one knows the addresses of
enabled LSM callbacks at boot time and only the order can possibly
change at boot time with the lsm= kernel command line parameter.

An array of static calls is defined per LSM hook and the static calls
are updated at boot time once the order has been determined.

With the hook now exposed as a static call, one can see that the
retpolines are no longer there and the LSM callbacks are invoked
directly:

security_file_ioctl:
   0xff...0ca0 <+0>:	endbr64
   0xff...0ca4 <+4>:	nopl   0x0(%rax,%rax,1)
   0xff...0ca9 <+9>:	push   %rbp
   0xff...0caa <+10>:	push   %r14
   0xff...0cac <+12>:	push   %rbx
   0xff...0cad <+13>:	mov    %rdx,%rbx
   0xff...0cb0 <+16>:	mov    %esi,%ebp
   0xff...0cb2 <+18>:	mov    %rdi,%r14
   0xff...0cb5 <+21>:	jmp    0xff...0cc7 <security_file_ioctl+39>
  			       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   Static key enabled for SELinux

   0xffffffff818f0cb7 <+23>:	jmp    0xff...0cde <security_file_ioctl+62>
   				^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

   Static key enabled for BPF LSM. This is something that is changed to
   default to false to avoid the existing side effect issues of BPF LSM
   [1] in a subsequent patch.

   0xff...0cb9 <+25>:	xor    %eax,%eax
   0xff...0cbb <+27>:	xchg   %ax,%ax
   0xff...0cbd <+29>:	pop    %rbx
   0xff...0cbe <+30>:	pop    %r14
   0xff...0cc0 <+32>:	pop    %rbp
   0xff...0cc1 <+33>:	cs jmp 0xff...0000 <__x86_return_thunk>
   0xff...0cc7 <+39>:	endbr64
   0xff...0ccb <+43>:	mov    %r14,%rdi
   0xff...0cce <+46>:	mov    %ebp,%esi
   0xff...0cd0 <+48>:	mov    %rbx,%rdx
   0xff...0cd3 <+51>:	call   0xff...3230 <selinux_file_ioctl>
   			       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   Direct call to SELinux.

   0xff...0cd8 <+56>:	test   %eax,%eax
   0xff...0cda <+58>:	jne    0xff...0cbd <security_file_ioctl+29>
   0xff...0cdc <+60>:	jmp    0xff...0cb7 <security_file_ioctl+23>
   0xff...0cde <+62>:	endbr64
   0xff...0ce2 <+66>:	mov    %r14,%rdi
   0xff...0ce5 <+69>:	mov    %ebp,%esi
   0xff...0ce7 <+71>:	mov    %rbx,%rdx
   0xff...0cea <+74>:	call   0xff...e220 <bpf_lsm_file_ioctl>
   			       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   Direct call to BPF LSM.

   0xff...0cef <+79>:	test   %eax,%eax
   0xff...0cf1 <+81>:	jne    0xff...0cbd <security_file_ioctl+29>
   0xff...0cf3 <+83>:	jmp    0xff...0cb9 <security_file_ioctl+25>
   0xff...0cf5 <+85>:	endbr64
   0xff...0cf9 <+89>:	mov    %r14,%rdi
   0xff...0cfc <+92>:	mov    %ebp,%esi
   0xff...0cfe <+94>:	mov    %rbx,%rdx
   0xff...0d01 <+97>:	pop    %rbx
   0xff...0d02 <+98>:	pop    %r14
   0xff...0d04 <+100>:	pop    %rbp
   0xff...0d05 <+101>:	ret
   0xff...0d06 <+102>:	int3
   0xff...0d07 <+103>:	int3
   0xff...0d08 <+104>:	int3
   0xff...0d09 <+105>:	int3

While this patch uses static_branch_unlikely indicating that an LSM hook
is likely to be not present. In most cases this is still a better choice
as even when an LSM with one hook is added, empty slots are created for
all LSM hooks (especially when many LSMs that do not initialize most
hooks are present on the system).

There are some hooks that don't use the call_int_hook or
call_void_hook. These hooks are updated to use a new macro called
lsm_for_each_hook where the lsm_callback is directly invoked as an
indirect call.

Below are results of the relevant Unixbench system benchmarks with BPF LSM
and SELinux enabled with default policies enabled with and without these
patches.

Benchmark                                          Delta(%): (+ is better)
==========================================================================
Execl Throughput                                             +1.9356
File Write 1024 bufsize 2000 maxblocks                       +6.5953
Pipe Throughput                                              +9.5499
Pipe-based Context Switching                                 +3.0209
Process Creation                                             +2.3246
Shell Scripts (1 concurrent)                                 +1.4975
System Call Overhead                                         +2.7815
System Benchmarks Index Score (Partial Only):                +3.4859

In the best case, some syscalls like eventfd_create benefitted to about
~10%.

Tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hooks.h | 52 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 4687985b9175..090d1d3e19fe 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -30,19 +30,47 @@
 #include <linux/init.h>
 #include <linux/rculist.h>
 #include <linux/xattr.h>
+#include <linux/static_call.h>
+#include <linux/unroll.h>
+#include <linux/jump_label.h>
+#include <linux/lsm_count.h>
 
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
 	#include "lsm_hook_defs.h"
 	#undef LSM_HOOK
+	void *lsm_func_addr;
 };
 
-struct security_hook_heads {
-	#define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME;
-	#include "lsm_hook_defs.h"
-	#undef LSM_HOOK
+/*
+ * @key: static call key as defined by STATIC_CALL_KEY
+ * @trampoline: static call trampoline as defined by STATIC_CALL_TRAMP
+ * @hl: The security_hook_list as initialized by the owning LSM.
+ * @active: Enabled when the static call has an LSM hook associated.
+ */
+struct lsm_static_call {
+	struct static_call_key *key;
+	void *trampoline;
+	struct security_hook_list *hl;
+	/* this needs to be true or false based on what the key defaults to */
+	struct static_key_false *active;
 } __randomize_layout;
 
+/*
+ * Table of the static calls for each LSM hook.
+ * Once the LSMs are initialized, their callbacks will be copied to these
+ * tables such that the calls are filled backwards (from last to first).
+ * This way, we can jump directly to the first used static call, and execute
+ * all of them after. This essentially makes the entry point
+ * dynamic to adapt the number of static calls to the number of callbacks.
+ */
+struct lsm_static_calls_table {
+	#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
+		struct lsm_static_call NAME[MAX_LSM_COUNT];
+	#include <linux/lsm_hook_defs.h>
+	#undef LSM_HOOK
+} __packed __randomize_layout;
+
 /**
  * struct lsm_id - Identify a Linux Security Module.
  * @lsm: name of the LSM, must be approved by the LSM maintainers
@@ -58,10 +86,14 @@ struct lsm_id {
 /*
  * Security module hook list structure.
  * For use with generic list macros for common operations.
+ *
+ * struct security_hook_list - Contents of a cacheable, mappable object.
+ * @scalls: The beginning of the array of static calls assigned to this hook.
+ * @hook: The callback for the hook.
+ * @lsm: The name of the lsm that owns this hook.
  */
 struct security_hook_list {
-	struct hlist_node list;
-	struct hlist_head *head;
+	struct lsm_static_call *scalls;
 	union security_list_options hook;
 	const struct lsm_id *lsmid;
 } __randomize_layout;
@@ -98,8 +130,11 @@ struct lsm_blob_sizes {
  * care of the common case and reduces the amount of
  * text involved.
  */
-#define LSM_HOOK_INIT(HEAD, HOOK) \
-	{ .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } }
+#define LSM_HOOK_INIT(NAME, HOOK)			\
+	{						\
+		.scalls = static_calls_table.NAME,	\
+		.hook = { .NAME = HOOK }		\
+	}
 
 extern void security_add_hooks(struct security_hook_list *hooks, int count,
 			       const struct lsm_id *lsmid);
@@ -134,7 +169,6 @@ struct lsm_info {
 
 /* DO NOT tamper with these variables outside of the LSM framework */
 extern char *lsm_names;
-extern struct security_hook_heads security_hook_heads;
 extern struct lsm_static_calls_table static_calls_table __ro_after_init;
 extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
 extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
-- 
cgit v1.2.3


From 6c65914a40d33e96ceedc757be0129139cdf7852 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 16 Aug 2024 11:54:23 -0700
Subject: Input: keypad-nomadik-ske - remove the driver

The users of this driver were removed in 2013 in commit 28633c54bda6
("ARM: ux500: Rip out keypad initialisation which is no longer used").

Remove the driver as well.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/Zr-gX0dfN4te_8VG@google.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/keypad-nomadik-ske.h | 50 ------------------------
 1 file changed, 50 deletions(-)
 delete mode 100644 include/linux/platform_data/keypad-nomadik-ske.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/keypad-nomadik-ske.h b/include/linux/platform_data/keypad-nomadik-ske.h
deleted file mode 100644
index 7efabbca1dca..000000000000
--- a/include/linux/platform_data/keypad-nomadik-ske.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson SA 2010
- *
- * Author: Naveen Kumar Gaddipati <naveen.gaddipati@stericsson.com>
- *
- * ux500 Scroll key and Keypad Encoder (SKE) header
- */
-
-#ifndef __SKE_H
-#define __SKE_H
-
-#include <linux/input/matrix_keypad.h>
-
-/* register definitions for SKE peripheral */
-#define SKE_CR		0x00
-#define SKE_VAL0	0x04
-#define SKE_VAL1	0x08
-#define SKE_DBCR	0x0C
-#define SKE_IMSC	0x10
-#define SKE_RIS		0x14
-#define SKE_MIS		0x18
-#define SKE_ICR		0x1C
-
-/*
- * Keypad module
- */
-
-/**
- * struct keypad_platform_data - structure for platform specific data
- * @init:	pointer to keypad init function
- * @exit:	pointer to keypad deinitialisation function
- * @keymap_data: matrix scan code table for keycodes
- * @krow:	maximum number of rows
- * @kcol:	maximum number of columns
- * @debounce_ms: platform specific debounce time
- * @no_autorepeat: flag for auto repetition
- * @wakeup_enable: allow waking up the system
- */
-struct ske_keypad_platform_data {
-	int (*init)(void);
-	int (*exit)(void);
-	const struct matrix_keymap_data *keymap_data;
-	u8 krow;
-	u8 kcol;
-	u8 debounce_ms;
-	bool no_autorepeat;
-	bool wakeup_enable;
-};
-#endif	/*__SKE_KPD_H*/
-- 
cgit v1.2.3


From 5bc46b49c828a6dfaab80b71ecb63fe76a1096d2 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@kernel.org>
Date: Mon, 22 Jul 2024 14:02:20 +0200
Subject: nvme-tcp: check for invalidated or revoked key

key_lookup() will always return a key, even if that key is revoked
or invalidated. So check for invalid keys before continuing.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-keyring.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index e10333d78dbb..19d2b256180f 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
 
 key_serial_t nvme_keyring_id(void);
-
+struct key *nvme_tls_key_lookup(key_serial_t key_id);
 #else
 
 static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
@@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void)
 {
 	return 0;
 }
+static inline struct key *nvme_tls_key_lookup(key_serial_t key_id)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
 #endif /* !CONFIG_NVME_KEYRING */
 #endif /* _NVME_KEYRING_H */
-- 
cgit v1.2.3


From 559048d156ff3391c4b793779a824c9193e20442 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 5 Aug 2024 14:43:44 -0700
Subject: string: Check for "nonstring" attribute on strscpy() arguments

GCC already checks for arguments that are marked with the "nonstring"[1]
attribute when used on standard C String API functions (e.g. strcpy). Gain
this compile-time checking also for the kernel's primary string copying
function, strscpy().

Note that Clang has neither "nonstring" nor __builtin_has_attribute().

Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-nonstring-variable-attribute [1]
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Tested-by: Miguel Ojeda <ojeda@kernel.org>
Link: https://lore.kernel.org/r/20240805214340.work.339-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/compiler.h       |  3 +++
 include/linux/compiler_types.h |  7 +++++++
 include/linux/string.h         | 12 ++++++++----
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2df665fa2964..ec55bcce4146 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -242,6 +242,9 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */
+#define __must_be_cstr(p)	BUILD_BUG_ON_ZERO(__annotated(p, nonstring))
+
 /*
  * This returns a constant expression while determining if an argument is
  * a constant expression, most importantly without evaluating the argument.
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index f14c275950b5..1a957ea2f4fe 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -421,6 +421,13 @@ struct ftrace_likely_data {
 #define __member_size(p)	__builtin_object_size(p, 1)
 #endif
 
+/* Determine if an attribute has been applied to a variable. */
+#if __has_builtin(__builtin_has_attribute)
+#define __annotated(var, attr)	__builtin_has_attribute(var, attr)
+#else
+#define __annotated(var, attr)	(false)
+#endif
+
 /*
  * Some versions of gcc do not mark 'asm goto' volatile:
  *
diff --git a/include/linux/string.h b/include/linux/string.h
index 9edace076ddb..95b3fc308f4f 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -76,12 +76,16 @@ ssize_t sized_strscpy(char *, const char *, size_t);
  * known size.
  */
 #define __strscpy0(dst, src, ...)	\
-	sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst))
-#define __strscpy1(dst, src, size)	sized_strscpy(dst, src, size)
+	sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst) +	\
+				__must_be_cstr(dst) + __must_be_cstr(src))
+#define __strscpy1(dst, src, size)	\
+	sized_strscpy(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src))
 
 #define __strscpy_pad0(dst, src, ...)	\
-	sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst))
-#define __strscpy_pad1(dst, src, size)	sized_strscpy_pad(dst, src, size)
+	sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst) +	\
+				    __must_be_cstr(dst) + __must_be_cstr(src))
+#define __strscpy_pad1(dst, src, size)	\
+	sized_strscpy_pad(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src))
 
 /**
  * strscpy - Copy a C-string into a sized buffer
-- 
cgit v1.2.3


From 84429b675bcfd2a518ae167ee4661cdf7539aa7d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 22 Aug 2024 15:50:09 +0200
Subject: fs: Allow fine-grained control of folio sizes

We need filesystems to be able to communicate acceptable folio sizes
to the pagecache for a variety of uses (e.g. large block sizes).
Support a range of folio sizes between order-0 and order-31.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d9c7edb6422b..c60025bb584c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,14 +204,21 @@ enum mapping_flags {
 	AS_EXITING	= 4, 	/* final truncate in progress */
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
-	AS_LARGE_FOLIO_SUPPORT = 6,
-	AS_RELEASE_ALWAYS,	/* Call ->release_folio(), even if no private data */
-	AS_STABLE_WRITES,	/* must wait for writeback before modifying
+	AS_RELEASE_ALWAYS = 6,	/* Call ->release_folio(), even if no private data */
+	AS_STABLE_WRITES = 7,	/* must wait for writeback before modifying
 				   folio contents */
-	AS_INACCESSIBLE,	/* Do not attempt direct R/W access to the mapping,
-				   including to move the mapping */
+	AS_INACCESSIBLE = 8,	/* Do not attempt direct R/W access to the mapping */
+	/* Bits 16-25 are used for FOLIO_ORDER */
+	AS_FOLIO_ORDER_BITS = 5,
+	AS_FOLIO_ORDER_MIN = 16,
+	AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
 };
 
+#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1)
+#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN)
+#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX)
+#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK)
+
 /**
  * mapping_set_error - record a writeback error in the address_space
  * @mapping: the mapping in which an error should be set
@@ -367,9 +374,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define MAX_XAS_ORDER		(XA_CHUNK_SHIFT * 2 - 1)
 #define MAX_PAGECACHE_ORDER	min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
 
+/*
+ * mapping_set_folio_order_range() - Set the orders supported by a file.
+ * @mapping: The address space of the file.
+ * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
+ * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
+ *
+ * The filesystem should call this function in its inode constructor to
+ * indicate which base size (min) and maximum size (max) of folio the VFS
+ * can use to cache the contents of the file.  This should only be used
+ * if the filesystem needs special handling of folio sizes (ie there is
+ * something the core cannot know).
+ * Do not tune it based on, eg, i_size.
+ *
+ * Context: This should not be called while the inode is active as it
+ * is non-atomic.
+ */
+static inline void mapping_set_folio_order_range(struct address_space *mapping,
+						 unsigned int min,
+						 unsigned int max)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return;
+
+	if (min > MAX_PAGECACHE_ORDER)
+		min = MAX_PAGECACHE_ORDER;
+
+	if (max > MAX_PAGECACHE_ORDER)
+		max = MAX_PAGECACHE_ORDER;
+
+	if (max < min)
+		max = min;
+
+	mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
+		(min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX);
+}
+
+static inline void mapping_set_folio_min_order(struct address_space *mapping,
+					       unsigned int min)
+{
+	mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER);
+}
+
 /**
  * mapping_set_large_folios() - Indicate the file supports large folios.
- * @mapping: The file.
+ * @mapping: The address space of the file.
  *
  * The filesystem should call this function in its inode constructor to
  * indicate that the VFS can use large folios to cache the contents of
@@ -380,7 +429,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
  */
 static inline void mapping_set_large_folios(struct address_space *mapping)
 {
-	__set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+	mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER);
+}
+
+static inline unsigned int
+mapping_max_folio_order(const struct address_space *mapping)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return 0;
+	return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX;
+}
+
+static inline unsigned int
+mapping_min_folio_order(const struct address_space *mapping)
+{
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return 0;
+	return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
 }
 
 /*
@@ -389,20 +454,17 @@ static inline void mapping_set_large_folios(struct address_space *mapping)
  */
 static inline bool mapping_large_folio_support(struct address_space *mapping)
 {
-	/* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */
+	/* AS_FOLIO_ORDER is only reasonable for pagecache folios */
 	VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
 			"Anonymous mapping always supports large folio");
 
-	return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-		test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+	return mapping_max_folio_order(mapping) > 0;
 }
 
 /* Return the maximum folio size for this pagecache mapping, in bytes. */
-static inline size_t mapping_max_folio_size(struct address_space *mapping)
+static inline size_t mapping_max_folio_size(const struct address_space *mapping)
 {
-	if (mapping_large_folio_support(mapping))
-		return PAGE_SIZE << MAX_PAGECACHE_ORDER;
-	return PAGE_SIZE;
+	return PAGE_SIZE << mapping_max_folio_order(mapping);
 }
 
 static inline int filemap_nr_thps(struct address_space *mapping)
-- 
cgit v1.2.3


From ab95d23bab220ef845c0d422f49452a475330eaf Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Thu, 22 Aug 2024 15:50:10 +0200
Subject: filemap: allocate mapping_min_order folios in the page cache

filemap_create_folio() and do_read_cache_folio() were always allocating
folio of order 0. __filemap_get_folio was trying to allocate higher
order folios when fgp_flags had higher order hint set but it will default
to order 0 folio if higher order memory allocation fails.

Supporting mapping_min_order implies that we guarantee each folio in the
page cache has at least an order of mapping_min_order. When adding new
folios to the page cache we must also ensure the index used is aligned to
the mapping_min_order as the page cache requires the index to be aligned
to the order of the folio.

Co-developed-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20240822135018.1931258-3-kernel@pankajraghav.com
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c60025bb584c..4cc170949e9c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -448,6 +448,26 @@ mapping_min_folio_order(const struct address_space *mapping)
 	return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN;
 }
 
+static inline unsigned long
+mapping_min_folio_nrpages(struct address_space *mapping)
+{
+	return 1UL << mapping_min_folio_order(mapping);
+}
+
+/**
+ * mapping_align_index() - Align index for this mapping.
+ * @mapping: The address_space.
+ *
+ * The index of a folio must be naturally aligned.  If you are adding a
+ * new folio to the page cache and need to know what index to give it,
+ * call this function.
+ */
+static inline pgoff_t mapping_align_index(struct address_space *mapping,
+					  pgoff_t index)
+{
+	return round_down(index, mapping_min_folio_nrpages(mapping));
+}
+
 /*
  * Large folio support currently depends on THP.  These dependencies are
  * being worked on but are not yet fixed.
-- 
cgit v1.2.3


From 3849687869092094003ba009dc00e2e0237a3b8a Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 21 Aug 2024 17:09:55 +0200
Subject: net: phy: Introduce ethernet link topology representation

Link topologies containing multiple network PHYs attached to the same
net_device can be found when using a PHY as a media converter for use
with an SFP connector, on which an SFP transceiver containing a PHY can
be used.

With the current model, the transceiver's PHY can't be used for
operations such as cable testing, timestamping, macsec offload, etc.

The reason being that most of the logic for these configuration, coming
from either ethtool netlink or ioctls tend to use netdev->phydev, which
in multi-phy systems will reference the PHY closest to the MAC.

Introduce a numbering scheme allowing to enumerate PHY devices that
belong to any netdev, which can in turn allow userspace to take more
precise decisions with regard to each PHY's configuration.

The numbering is maintained per-netdev, in a phy_device_list.
The numbering works similarly to a netdevice's ifindex, with
identifiers that are only recycled once INT_MAX has been reached.

This prevents races that could occur between PHY listing and SFP
transceiver removal/insertion.

The identifiers are assigned at phy_attach time, as the numbering
depends on the netdevice the phy is attached to. The PHY index can be
re-used for PHYs that are persistent.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Tested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h         |  4 +-
 include/linux/phy.h               |  4 ++
 include/linux/phy_link_topology.h | 82 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/phy_link_topology.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 614ec5d3d75b..289a6133769c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,7 +40,6 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
-
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <linux/netdevice_xmit.h>
@@ -81,6 +80,7 @@ struct xdp_frame;
 struct xdp_metadata_ops;
 struct xdp_md;
 struct ethtool_netdev_state;
+struct phy_link_topology;
 
 typedef u32 xdp_features_t;
 
@@ -1951,6 +1951,7 @@ enum netdev_reg_state {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
+ *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2342,6 +2343,7 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
+	struct phy_link_topology	*link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6b7d40d49129..3b0f4bf80650 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -554,6 +554,9 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
+ * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
+ *	      from userspace, similar to ifindex. A zero index means the PHY
+ *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -656,6 +659,7 @@ struct phy_device {
 
 	struct device_link *devlink;
 
+	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
new file mode 100644
index 000000000000..68a59e25821c
--- /dev/null
+++ b/include/linux/phy_link_topology.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PHY device list allow maintaining a list of PHY devices that are
+ * part of a netdevice's link topology. PHYs can for example be chained,
+ * as is the case when using a PHY that exposes an SFP module, on which an
+ * SFP transceiver that embeds a PHY is connected.
+ *
+ * This list can then be used by userspace to leverage individual PHY
+ * capabilities.
+ */
+#ifndef __PHY_LINK_TOPOLOGY_H
+#define __PHY_LINK_TOPOLOGY_H
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+struct xarray;
+struct phy_device;
+struct sfp_bus;
+
+struct phy_link_topology {
+	struct xarray phys;
+	u32 next_phy_index;
+};
+
+struct phy_device_node {
+	enum phy_upstream upstream_type;
+
+	union {
+		struct net_device	*netdev;
+		struct phy_device	*phydev;
+	} upstream;
+
+	struct sfp_bus *parent_sfp_bus;
+
+	struct phy_device *phy;
+};
+
+#if IS_ENABLED(CONFIG_PHYLIB)
+int phy_link_topo_add_phy(struct net_device *dev,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream);
+
+void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy);
+
+static inline struct phy_device *
+phy_link_topo_get_phy(struct net_device *dev, u32 phyindex)
+{
+	struct phy_link_topology *topo = dev->link_topo;
+	struct phy_device_node *pdn;
+
+	if (!topo)
+		return NULL;
+
+	pdn = xa_load(&topo->phys, phyindex);
+	if (pdn)
+		return pdn->phy;
+
+	return NULL;
+}
+
+#else
+static inline int phy_link_topo_add_phy(struct net_device *dev,
+					struct phy_device *phy,
+					enum phy_upstream upt, void *upstream)
+{
+	return 0;
+}
+
+static inline void phy_link_topo_del_phy(struct net_device *dev,
+					 struct phy_device *phy)
+{
+}
+
+static inline struct phy_device *
+phy_link_topo_get_phy(struct net_device *dev, u32 phyindex)
+{
+	return NULL;
+}
+#endif
+
+#endif /* __PHY_LINK_TOPOLOGY_H */
-- 
cgit v1.2.3


From 4d76f115ab9121f4458330da962ae9ef5430e60b Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 21 Aug 2024 17:09:56 +0200
Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY

Pass the phy_device as a parameter to the sfp upstream .disconnect_phy
operation. This is preparatory work to help track phy devices across
a net_device's link.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Tested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index b14be59550e3..54abb4d22b2e 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -550,7 +550,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv);
+	void (*disconnect_phy)(void *priv, struct phy_device *);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
-- 
cgit v1.2.3


From b2db6f4ace72e71fa09b8d1354f8ac9854140d74 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 21 Aug 2024 17:09:57 +0200
Subject: net: phy: add helpers to handle sfp phy connect/disconnect

There are a few PHY drivers that can handle SFP modules through their
sfp_upstream_ops. Introduce Phylib helpers to keep track of connected
SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the
upstream PHY's netdev's namespace.

By doing so, these SFP PHYs can be enumerated and exposed to users,
which will be able to use their capabilities.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Tested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3b0f4bf80650..a98bc91a0cde 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1781,6 +1781,8 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
-- 
cgit v1.2.3


From 0a2f7de0f3b96c4a30e56d2c79f8d812f89599a1 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 21 Aug 2024 17:09:58 +0200
Subject: net: sfp: Add helper to return the SFP bus name

Knowing the bus name is helpful when we want to expose the link topology
to userspace, add a helper to return the SFP bus name.

This call will always be made while holding the RTNL which ensures
that the SFP driver won't unbind from the device. The returned pointer
to the bus name will only be used while RTNL is held.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Suggested-by: "Russell King (Oracle)" <linux@armlinux.org.uk>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Tested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sfp.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 54abb4d22b2e..60c65cea74f6 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -576,6 +576,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
+const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -654,6 +655,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
+
+static inline const char *sfp_get_name(struct sfp_bus *bus)
+{
+	return NULL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From c579286a514d88a3f0d3bdabfd4d88737b33cb17 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 19 Aug 2024 18:31:33 +0200
Subject: thermal: core: Drop unused bind/unbind functions and callbacks

There are no more callers of thermal_zone_bind_cooling_device() and
thermal_zone_unbind_cooling_device(), so drop them along with all of
the corresponding headers, code and documentation.

Moreover, because the .bind() and .unbind() thermal zone callbacks would
only be used when the above functions, respectively, were called, drop
them as well along with all of the code related to them.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Acked-by: Huisong Li <lihuisong@huawei.com>
Reviewed-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/4251116.1IzOArtZ34@rjwysocki.net
---
 include/linux/thermal.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6599a26847f7..25ea8fe2313e 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -92,10 +92,6 @@ struct cooling_spec {
 };
 
 struct thermal_zone_device_ops {
-	int (*bind) (struct thermal_zone_device *,
-		     struct thermal_cooling_device *);
-	int (*unbind) (struct thermal_zone_device *,
-		       struct thermal_cooling_device *);
 	bool (*should_bind) (struct thermal_zone_device *,
 			     const struct thermal_trip *,
 			     struct thermal_cooling_device *,
@@ -247,12 +243,6 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd);
 int thermal_zone_device_id(struct thermal_zone_device *tzd);
 struct device *thermal_zone_device(struct thermal_zone_device *tzd);
 
-int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
-				     struct thermal_cooling_device *,
-				     unsigned long, unsigned long,
-				     unsigned int);
-int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
-				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *,
 				enum thermal_notify_event);
 
-- 
cgit v1.2.3


From ccce71013406a0a3c81850dab940f07b112349d3 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 14 Aug 2024 14:21:18 +0200
Subject: mtd: rawnand: davinci: make platform_data private

There are no longer any users of the platform data for davinci rawnand
in board files. We can remove the public pdata headers and move the
structures that are still used into the driver compilation unit while
removing the rest.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20240814122120.13975-1-brgl@bgdev.pl
---
 include/linux/platform_data/mtd-davinci-aemif.h | 36 ----------
 include/linux/platform_data/mtd-davinci.h       | 88 -------------------------
 2 files changed, 124 deletions(-)
 delete mode 100644 include/linux/platform_data/mtd-davinci-aemif.h
 delete mode 100644 include/linux/platform_data/mtd-davinci.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/mtd-davinci-aemif.h b/include/linux/platform_data/mtd-davinci-aemif.h
deleted file mode 100644
index a49826214a39..000000000000
--- a/include/linux/platform_data/mtd-davinci-aemif.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * TI DaVinci AEMIF support
- *
- * Copyright 2010 (C) Texas Instruments, Inc. https://www.ti.com/
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-#ifndef _MACH_DAVINCI_AEMIF_H
-#define _MACH_DAVINCI_AEMIF_H
-
-#include <linux/platform_device.h>
-
-#define NRCSR_OFFSET		0x00
-#define AWCCR_OFFSET		0x04
-#define A1CR_OFFSET		0x10
-
-#define ACR_ASIZE_MASK		0x3
-#define ACR_EW_MASK		BIT(30)
-#define ACR_SS_MASK		BIT(31)
-
-/* All timings in nanoseconds */
-struct davinci_aemif_timing {
-	u8	wsetup;
-	u8	wstrobe;
-	u8	whold;
-
-	u8	rsetup;
-	u8	rstrobe;
-	u8	rhold;
-
-	u8	ta;
-};
-
-#endif
diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h
deleted file mode 100644
index dd474dd44848..000000000000
--- a/include/linux/platform_data/mtd-davinci.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mach-davinci/nand.h
- *
- * Copyright © 2006 Texas Instruments.
- *
- * Ported to 2.6.23 Copyright © 2008 by
- *   Sander Huijsen <Shuijsen@optelecom-nkf.com>
- *   Troy Kisky <troy.kisky@boundarydevices.com>
- *   Dirk Behme <Dirk.Behme@gmail.com>
- *
- * --------------------------------------------------------------------------
- */
-
-#ifndef __ARCH_ARM_DAVINCI_NAND_H
-#define __ARCH_ARM_DAVINCI_NAND_H
-
-#include <linux/mtd/rawnand.h>
-
-#define NANDFCR_OFFSET		0x60
-#define NANDFSR_OFFSET		0x64
-#define NANDF1ECC_OFFSET	0x70
-
-/* 4-bit ECC syndrome registers */
-#define NAND_4BIT_ECC_LOAD_OFFSET	0xbc
-#define NAND_4BIT_ECC1_OFFSET		0xc0
-#define NAND_4BIT_ECC2_OFFSET		0xc4
-#define NAND_4BIT_ECC3_OFFSET		0xc8
-#define NAND_4BIT_ECC4_OFFSET		0xcc
-#define NAND_ERR_ADD1_OFFSET		0xd0
-#define NAND_ERR_ADD2_OFFSET		0xd4
-#define NAND_ERR_ERRVAL1_OFFSET		0xd8
-#define NAND_ERR_ERRVAL2_OFFSET		0xdc
-
-/* NOTE:  boards don't need to use these address bits
- * for ALE/CLE unless they support booting from NAND.
- * They're used unless platform data overrides them.
- */
-#define	MASK_ALE		0x08
-#define	MASK_CLE		0x10
-
-struct davinci_nand_pdata {		/* platform_data */
-	uint32_t		mask_ale;
-	uint32_t		mask_cle;
-
-	/*
-	 * 0-indexed chip-select number of the asynchronous
-	 * interface to which the NAND device has been connected.
-	 *
-	 * So, if you have NAND connected to CS3 of DA850, you
-	 * will pass '1' here. Since the asynchronous interface
-	 * on DA850 starts from CS2.
-	 */
-	uint32_t		core_chipsel;
-
-	/* for packages using two chipselects */
-	uint32_t		mask_chipsel;
-
-	/* board's default static partition info */
-	struct mtd_partition	*parts;
-	unsigned		nr_parts;
-
-	/* none  == NAND_ECC_ENGINE_TYPE_NONE (strongly *not* advised!!)
-	 * soft  == NAND_ECC_ENGINE_TYPE_SOFT
-	 * else  == NAND_ECC_ENGINE_TYPE_ON_HOST, according to ecc_bits
-	 *
-	 * All DaVinci-family chips support 1-bit hardware ECC.
-	 * Newer ones also support 4-bit ECC, but are awkward
-	 * using it with large page chips.
-	 */
-	enum nand_ecc_engine_type engine_type;
-	enum nand_ecc_placement ecc_placement;
-	u8			ecc_bits;
-
-	/* e.g. NAND_BUSWIDTH_16 */
-	unsigned		options;
-	/* e.g. NAND_BBT_USE_FLASH */
-	unsigned		bbt_options;
-
-	/* Main and mirror bbt descriptor overrides */
-	struct nand_bbt_descr	*bbt_td;
-	struct nand_bbt_descr	*bbt_md;
-
-	/* Access timings */
-	struct davinci_aemif_timing	*timing;
-};
-
-#endif	/* __ARCH_ARM_DAVINCI_NAND_H */
-- 
cgit v1.2.3


From 8a48281cfa7f380707d42b649acda3ea36348697 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Fri, 23 Aug 2024 15:42:01 +0800
Subject: PCI: Make pci_bus_type constant

Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a
const *"), the driver core can properly handle constant struct bus_type,
move the pci_bus_type variable to be a constant structure as well, placing
it into read-only memory which can not be modified at runtime.

Link: https://lore.kernel.org/r/20240823074202.139265-1-kunwu.chan@linux.dev
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..197292b336a5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1098,7 +1098,7 @@ enum pcie_bus_config_types {
 
 extern enum pcie_bus_config_types pcie_bus_config;
 
-extern struct bus_type pci_bus_type;
+extern const struct bus_type pci_bus_type;
 
 /* Do NOT directly access these two variables, unless you are arch-specific PCI
  * code, or PCI core code. */
-- 
cgit v1.2.3


From 9246b487ab3c3b5993aae7552b7a4c541cc14a49 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Fri, 23 Aug 2024 17:57:08 +0800
Subject: PCI: Add function 0 DMA alias quirk for Glenfly Arise chip

Add DMA support for audio function of Glenfly Arise chip, which uses
Requester ID of function 0.

Link: https://lore.kernel.org/r/CA2BBD087345B6D1+20240823095708.3237375-1-wangyuli@uniontech.com
Signed-off-by: SiyuLi <siyuli@glenfly.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
[bhelgaas: lower-case hex to match local code, drop unused Device IDs]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e388c8b1cbc2..2c94d4004dd5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2661,6 +2661,8 @@
 #define PCI_DEVICE_ID_DCI_PCCOM8	0x0002
 #define PCI_DEVICE_ID_DCI_PCCOM2	0x0004
 
+#define PCI_VENDOR_ID_GLENFLY		0x6766
+
 #define PCI_VENDOR_ID_INTEL		0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC	0x0008
 #define PCI_DEVICE_ID_INTEL_HDA_CML_LP	0x02c8
-- 
cgit v1.2.3


From d59232afb0344e33e9399f308d9b4a03876e7676 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 13 Aug 2024 21:24:22 +0000
Subject: bpf: Rename ARG_PTR_TO_KPTR -> ARG_KPTR_XCHG_DEST

ARG_PTR_TO_KPTR is currently only used by the bpf_kptr_xchg helper.
Although it limits reg types for that helper's first arg to
PTR_TO_MAP_VALUE, any arbitrary mapval won't do: further custom
verification logic ensures that the mapval reg being xchgd-into is
pointing to a kptr field. If this is not the case, it's not safe to xchg
into that reg's pointee.

Let's rename the bpf_arg_type to more accurately describe the fairly
specific expectations that this arg type encodes.

This is a nonfunctional change.

Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Amery Hung <amery.hung@bytedance.com>
Link: https://lore.kernel.org/r/20240813212424.2871455-4-amery.hung@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 00dc4dd28cbd..dc63083f76b7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -744,7 +744,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_STACK,	/* pointer to stack */
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
-	ARG_PTR_TO_KPTR,	/* pointer to referenced kptr */
+	ARG_KPTR_XCHG_DEST,	/* pointer to destination that kptrs are bpf_kptr_xchg'd into */
 	ARG_PTR_TO_DYNPTR,      /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
 	__BPF_ARG_TYPE_MAX,
 
-- 
cgit v1.2.3


From 9e83dd3ebb14fadccb936308b7b101c75da76324 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 23 Aug 2024 18:39:34 +0800
Subject: irqchip/loongson-eiointc: Rename CPUHP_AP_IRQ_LOONGARCH_STARTING

Rename CPUHP_AP_IRQ_LOONGARCH_STARTING to CPUHP_AP_IRQ_EIOINTC_STARTING
because the upcoming AVECINTC irqchip driver will introduce a new state
and so both are clearly identifiable.

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240823103936.25092-3-zhangtianyang@loongson.cn
---
 include/linux/cpuhotplug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 51ba681b915a..e49807f7805c 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -145,7 +145,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_ARMADA_XP_STARTING,
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
-	CPUHP_AP_IRQ_LOONGARCH_STARTING,
+	CPUHP_AP_IRQ_EIOINTC_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 	CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-- 
cgit v1.2.3


From ae16f05c928a1336d5d9d19fd805d7bf29c3f0c8 Mon Sep 17 00:00:00 2001
From: Tianyang Zhang <zhangtianyang@loongson.cn>
Date: Fri, 23 Aug 2024 18:43:37 +0800
Subject: irqchip/loongarch-avec: Add AVEC irqchip support

Introduce the advanced extended interrupt controllers (AVECINTC). This
feature will allow each core to have 256 independent interrupt vectors
and MSI interrupts can be independently routed to any vector on any CPU.

The whole topology of irqchips in LoongArch machines looks like this if
AVECINTC is supported:

  +-----+     +-----------------------+     +-------+
  | IPI | --> |        CPUINTC        | <-- | Timer |
  +-----+     +-----------------------+     +-------+
               ^          ^          ^
               |          |          |
        +---------+ +----------+ +---------+     +-------+
        | EIOINTC | | AVECINTC | | LIOINTC | <-- | UARTs |
        +---------+ +----------+ +---------+     +-------+
             ^            ^
             |            |
        +---------+  +---------+
        | PCH-PIC |  | PCH-MSI |
        +---------+  +---------+
          ^     ^           ^
          |     |           |
  +---------+ +---------+ +---------+
  | Devices | | PCH-LPC | | Devices |
  +---------+ +---------+ +---------+
                   ^
                   |
              +---------+
              | Devices |
              +---------+

Co-developed-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Co-developed-by: Liupu Wang <wangliupu@loongson.cn>
Signed-off-by: Liupu Wang <wangliupu@loongson.cn>
Co-developed-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240823104337.25577-2-zhangtianyang@loongson.cn
---
 include/linux/cpuhotplug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index e49807f7805c..55a726d317d4 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -146,6 +146,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
 	CPUHP_AP_IRQ_EIOINTC_STARTING,
+	CPUHP_AP_IRQ_AVECINTC_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
 	CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-- 
cgit v1.2.3


From 17e28a9aeae40d2de3c1ea3b94819ed94bfd6392 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Thu, 22 Aug 2024 15:31:58 +0300
Subject: genirq: Fix typo in struct comment

Remove redundant "e" in "assign(e)ments".

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20240822123205.2186221-1-costa.shul@redhat.com
---
 include/linux/interrupt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 694de61e0b38..457151f9f263 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -276,7 +276,7 @@ struct irq_affinity_notify {
 #define	IRQ_AFFINITY_MAX_SETS  4
 
 /**
- * struct irq_affinity - Description for automatic irq affinity assignements
+ * struct irq_affinity - Description for automatic irq affinity assignments
  * @pre_vectors:	Don't apply affinity to @pre_vectors at beginning of
  *			the MSI(-X) vector space
  * @post_vectors:	Don't apply affinity to @post_vectors at end of
-- 
cgit v1.2.3


From 7d3aed652d090508990d245f9d80dcc481910d02 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 22 Aug 2024 05:51:54 +0000
Subject: net: refactor ->ndo_bpf calls into dev_xdp_propagate

When net devices propagate xdp configurations to slave devices,
we will need to perform a memory provider check to ensure we're
not binding xdp to a device using unreadable netmem.

Currently the ->ndo_bpf calls in a few places. Adding checks to all
these places would not be ideal.

Refactor all the ->ndo_bpf calls into one place where we can add this
check in the future.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 289a6133769c..7dfa836d9dbf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3925,6 +3925,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 u8 dev_xdp_prog_count(struct net_device *dev);
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
-- 
cgit v1.2.3


From 2b8e976b984278edbeab3251d370e76d237699f9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 7 Aug 2024 15:18:14 +0100
Subject: io_uring: user registered clockid for wait timeouts

Add a new registration opcode IORING_REGISTER_CLOCK, which allows the
user to select which clock id it wants to use with CQ waiting timeouts.
It only allows a subset of all posix clocks and currently supports
CLOCK_MONOTONIC and CLOCK_BOOTTIME.

Suggested-by: Lewis Baker <lewissbaker@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/98f2bc8a3c36cdf8f0e6a275245e81e903459703.1723039801.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3315005df117..4b9ba523978d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -239,6 +239,9 @@ struct io_ring_ctx {
 		struct io_rings		*rings;
 		struct percpu_ref	refs;
 
+		clockid_t		clockid;
+		enum tk_offsets		clock_offset;
+
 		enum task_work_notify_mode	notify_method;
 		unsigned			sq_thread_idle;
 	} ____cacheline_aligned_in_smp;
-- 
cgit v1.2.3


From 1e9046e3a154608f63ce79edcb01e6afd6b10c7c Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Wed, 14 Aug 2024 17:35:55 +0200
Subject: rpmb: add Replay Protected Memory Block (RPMB) subsystem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A number of storage technologies support a specialised hardware
partition designed to be resistant to replay attacks. The underlying
HW protocols differ but the operations are common. The RPMB partition
cannot be accessed via standard block layer, but by a set of specific
RPMB commands. Such a partition provides authenticated and replay
protected access, hence suitable as a secure storage.

The initial aim of this patch is to provide a simple RPMB driver
interface which can be accessed by the optee driver to facilitate early
RPMB access to OP-TEE OS (secure OS) during the boot time.

A TEE device driver can claim the RPMB interface, for example, via
rpmb_interface_register() or rpmb_dev_find_device(). The RPMB driver
provides a callback to route RPMB frames to the RPMB device accessible
via rpmb_route_frames().

The detailed operation of implementing the access is left to the TEE
device driver itself.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Shyam Saini <shyamsaini@linux.microsoft.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: Manuel Traut <manut@mecka.net>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240814153558.708365-2-jens.wiklander@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/rpmb.h | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 include/linux/rpmb.h

(limited to 'include/linux')

diff --git a/include/linux/rpmb.h b/include/linux/rpmb.h
new file mode 100644
index 000000000000..cccda73eea4d
--- /dev/null
+++ b/include/linux/rpmb.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2015-2019 Intel Corp. All rights reserved
+ * Copyright (C) 2021-2022 Linaro Ltd
+ */
+#ifndef __RPMB_H__
+#define __RPMB_H__
+
+#include <linux/device.h>
+#include <linux/types.h>
+
+/**
+ * enum rpmb_type - type of underlying storage technology
+ *
+ * @RPMB_TYPE_EMMC  : emmc (JESD84-B50.1)
+ * @RPMB_TYPE_UFS   : UFS (JESD220)
+ * @RPMB_TYPE_NVME  : NVM Express
+ */
+enum rpmb_type {
+	RPMB_TYPE_EMMC,
+	RPMB_TYPE_UFS,
+	RPMB_TYPE_NVME,
+};
+
+/**
+ * struct rpmb_descr - RPMB description provided by the underlying block device
+ *
+ * @type             : block device type
+ * @route_frames     : routes frames to and from the RPMB device
+ * @dev_id           : unique device identifier read from the hardware
+ * @dev_id_len       : length of unique device identifier
+ * @reliable_wr_count: number of sectors that can be written in one access
+ * @capacity         : capacity of the device in units of 128K
+ *
+ * @dev_id is intended to be used as input when deriving the authenticaion key.
+ */
+struct rpmb_descr {
+	enum rpmb_type type;
+	int (*route_frames)(struct device *dev, u8 *req, unsigned int req_len,
+			    u8 *resp, unsigned int resp_len);
+	u8 *dev_id;
+	size_t dev_id_len;
+	u16 reliable_wr_count;
+	u16 capacity;
+};
+
+/**
+ * struct rpmb_dev - device which can support RPMB partition
+ *
+ * @dev              : device
+ * @id               : device_id
+ * @list_node        : linked list node
+ * @descr            : RPMB description
+ */
+struct rpmb_dev {
+	struct device dev;
+	int id;
+	struct list_head list_node;
+	struct rpmb_descr descr;
+};
+
+#define to_rpmb_dev(x)		container_of((x), struct rpmb_dev, dev)
+
+#if IS_ENABLED(CONFIG_RPMB)
+struct rpmb_dev *rpmb_dev_get(struct rpmb_dev *rdev);
+void rpmb_dev_put(struct rpmb_dev *rdev);
+struct rpmb_dev *rpmb_dev_find_device(const void *data,
+				      const struct rpmb_dev *start,
+				      int (*match)(struct device *dev,
+						   const void *data));
+int rpmb_interface_register(struct class_interface *intf);
+void rpmb_interface_unregister(struct class_interface *intf);
+struct rpmb_dev *rpmb_dev_register(struct device *dev,
+				   struct rpmb_descr *descr);
+int rpmb_dev_unregister(struct rpmb_dev *rdev);
+
+int rpmb_route_frames(struct rpmb_dev *rdev, u8 *req,
+		      unsigned int req_len, u8 *resp, unsigned int resp_len);
+
+#else
+static inline struct rpmb_dev *rpmb_dev_get(struct rpmb_dev *rdev)
+{
+	return NULL;
+}
+
+static inline void rpmb_dev_put(struct rpmb_dev *rdev) { }
+
+static inline struct rpmb_dev *
+rpmb_dev_find_device(const void *data, const struct rpmb_dev *start,
+		     int (*match)(struct device *dev, const void *data))
+{
+	return NULL;
+}
+
+static inline int rpmb_interface_register(struct class_interface *intf)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void rpmb_interface_unregister(struct class_interface *intf)
+{
+}
+
+static inline struct rpmb_dev *
+rpmb_dev_register(struct device *dev, struct rpmb_descr *descr)
+{
+	return NULL;
+}
+
+static inline int rpmb_dev_unregister(struct rpmb_dev *dev)
+{
+	return 0;
+}
+
+static inline int rpmb_route_frames(struct rpmb_dev *rdev, u8 *req,
+				    unsigned int req_len, u8 *resp,
+				    unsigned int resp_len)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_RPMB */
+
+#endif /* __RPMB_H__ */
-- 
cgit v1.2.3


From c30b855e814d9094e369a19fbd86c9bb5badc154 Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Wed, 14 Aug 2024 17:35:57 +0200
Subject: tee: add tee_device_set_dev_groups()

Add tee_device_set_dev_groups() to TEE drivers to supply driver specific
attribute groups. The class specific attributes are from now on added
via the tee_class, which currently only consist of implementation_id.

Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Link: https://lore.kernel.org/r/20240814153558.708365-4-jens.wiklander@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/tee_core.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h
index efd16ed52315..a38494d6b5f4 100644
--- a/include/linux/tee_core.h
+++ b/include/linux/tee_core.h
@@ -154,6 +154,18 @@ int tee_device_register(struct tee_device *teedev);
  */
 void tee_device_unregister(struct tee_device *teedev);
 
+/**
+ * tee_device_set_dev_groups() - Set device attribute groups
+ * @teedev:	Device to register
+ * @dev_groups: Attribute groups
+ *
+ * Assigns the provided @dev_groups to the @teedev to be registered later
+ * with tee_device_register(). Calling this function is optional, but if
+ * it's called it must be called before tee_device_register().
+ */
+void tee_device_set_dev_groups(struct tee_device *teedev,
+			       const struct attribute_group **dev_groups);
+
 /**
  * tee_session_calc_client_uuid() - Calculates client UUID for session
  * @uuid:		Resulting UUID
-- 
cgit v1.2.3


From d3596c73011d1b0e9b43ae12f1e0f491d6bb96eb Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 21 Aug 2024 15:24:06 +0200
Subject: mmc: core: Remove struct mmc_context_info

The 'mmc_context_info' structure is unused.

It has been introduced in:

  - commit 2220eedfd7ae ("mmc: fix async request mechanism for sequential
    read scenarios")

in 2013-02 and its usages have been removed in:

  - commit 126b62700386 ("mmc: core: Remove code no longer needed after the
    switch to blk-mq")
  - commit 0fbfd1251830 ("mmc: block: Remove code no longer needed after
    the switch to blk-mq")

in 2017-12.

Now remove this unused structure.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Vladimir Zapolskiy <vz@mleia.com>
Link: https://lore.kernel.org/r/232106a8a6a374dee25feea9b94498361568c10b.1724246389.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/host.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 88c6a76042ee..f85df7d045ca 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -291,20 +291,6 @@ struct mmc_slot {
 	void *handler_priv;
 };
 
-/**
- * mmc_context_info - synchronization details for mmc context
- * @is_done_rcv		wake up reason was done request
- * @is_new_req		wake up reason was new request
- * @is_waiting_last_req	mmc context waiting for single running request
- * @wait		wait queue
- */
-struct mmc_context_info {
-	bool			is_done_rcv;
-	bool			is_new_req;
-	bool			is_waiting_last_req;
-	wait_queue_head_t	wait;
-};
-
 struct regulator;
 struct mmc_pwrseq;
 
-- 
cgit v1.2.3


From f5e1638bf3c785600e2ab53e5532007af5296581 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Date: Sat, 24 Aug 2024 01:59:17 +0300
Subject: mmc: core: remove left-over data structure declarations

The last users of 'enum mmc_blk_status' and 'struct mmc_async_req'
were removed by commit 126b62700386 ("mmc: core: Remove code no longer
needed after the switch to blk-mq") in 2017, remove these two left-over
data structures.

Signed-off-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Link: https://lore.kernel.org/r/20240823225917.2826156-1-vladimir.zapolskiy@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/core.h | 12 ------------
 include/linux/mmc/host.h | 10 ----------
 2 files changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 2c7928a50907..f0ac2e469b32 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -11,18 +11,6 @@
 struct mmc_data;
 struct mmc_request;
 
-enum mmc_blk_status {
-	MMC_BLK_SUCCESS = 0,
-	MMC_BLK_PARTIAL,
-	MMC_BLK_CMD_ERR,
-	MMC_BLK_RETRY,
-	MMC_BLK_ABORT,
-	MMC_BLK_DATA_ERR,
-	MMC_BLK_ECC_ERR,
-	MMC_BLK_NOMEDIUM,
-	MMC_BLK_NEW_REQUEST,
-};
-
 struct mmc_command {
 	u32			opcode;
 	u32			arg;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index f85df7d045ca..ac01cd1622ef 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -264,16 +264,6 @@ struct mmc_cqe_ops {
 	void	(*cqe_recovery_finish)(struct mmc_host *host);
 };
 
-struct mmc_async_req {
-	/* active mmc request */
-	struct mmc_request	*mrq;
-	/*
-	 * Check error status of completed mmc request.
-	 * Returns 0 if success otherwise non zero.
-	 */
-	enum mmc_blk_status (*err_check)(struct mmc_card *, struct mmc_async_req *);
-};
-
 /**
  * struct mmc_slot - MMC slot functions
  *
-- 
cgit v1.2.3


From 8d3cefaf659265aa82b0373a563fdb9d16a2b947 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 15 Aug 2024 21:44:50 +0200
Subject: i2c: core: Lock address during client device instantiation

Krzysztof reported an issue [0] which is caused by parallel attempts to
instantiate the same I2C client device. This can happen if driver
supports auto-detection, but certain devices are also instantiated
explicitly.
The original change isn't actually wrong, it just revealed that I2C core
isn't prepared yet to handle this scenario.
Calls to i2c_new_client_device() can be nested, therefore we can't use a
simple mutex here. Parallel instantiation of devices at different addresses
is ok, so we just have to prevent parallel instantiation at the same address.
We can use a bitmap with one bit per 7-bit I2C client address, and atomic
bit operations to set/check/clear bits.
Now a parallel attempt to instantiate a device at the same address will
result in -EBUSY being returned, avoiding the "sysfs: cannot create duplicate
filename" splash.

Note: This patch version includes small cosmetic changes to the Tested-by
      version, only functional change is that address locking is supported
      for slave addresses too.

[0] https://lore.kernel.org/linux-i2c/9479fe4e-eb0c-407e-84c0-bd60c15baf74@ans.pl/T/#m12706546e8e2414d8f1a0dc61c53393f731685cc

Fixes: caba40ec3531 ("eeprom: at24: Probe for DDR3 thermal sensor in the SPD case")
Cc: stable@vger.kernel.org
Tested-by: Krzysztof Piotr Oledzki <ole@ans.pl>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/linux/i2c.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 7eedd0c662da..8d79440cd8ce 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -761,6 +761,9 @@ struct i2c_adapter {
 	struct regulator *bus_regulator;
 
 	struct dentry *debugfs;
+
+	/* 7bit address space */
+	DECLARE_BITMAP(addrs_in_instantiation, 1 << 7);
 };
 #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
 
-- 
cgit v1.2.3


From af8a6e65f42c6c89414017cc4d78403e83056172 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Fri, 3 May 2024 12:10:52 +0200
Subject: USB: serial: set driver owner when registering drivers

Modules registering drivers with usb_serial_register_drivers() might
forget to set .owner field.  The field is used by some of other kernel
parts for reference counting (try_module_get()), so it is expected that
drivers will set it.

Solve the problem by moving this task away from the drivers to the core
USB serial code, just like we did for platform_driver in
commit 9447057eaff8 ("platform_device: use a macro instead of
platform_driver_register").

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
[ johan: amend commit summary ]
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 include/linux/usb/serial.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 1a0a4dc87980..75b2b763f1ba 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -311,8 +311,11 @@ struct usb_serial_driver {
 #define to_usb_serial_driver(d) \
 	container_of(d, struct usb_serial_driver, driver)
 
-int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[],
-		const char *name, const struct usb_device_id *id_table);
+#define usb_serial_register_drivers(serial_drivers, name, id_table) \
+	__usb_serial_register_drivers(serial_drivers, THIS_MODULE, name, id_table)
+int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[],
+				  struct module *owner, const char *name,
+				  const struct usb_device_id *id_table);
 void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]);
 void usb_serial_port_softint(struct usb_serial_port *port);
 
-- 
cgit v1.2.3


From 1d4684fbe88dc28e2bf79f5e94a432f0469d2dac Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 2 Aug 2024 17:32:02 -0700
Subject: iommufd: Reorder include files

Reorder include files to alphabetic order to simplify maintenance, and
separate local headers and global headers with a blank line.

No functional change intended.

Link: https://patch.msgid.link/r/7524b037cc05afe19db3c18f863253e1d1554fa2.1722644866.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ffc3a949f837..c2f2f6b9148e 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -6,9 +6,9 @@
 #ifndef __LINUX_IOMMUFD_H
 #define __LINUX_IOMMUFD_H
 
-#include <linux/types.h>
-#include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
 
 struct device;
 struct iommufd_device;
-- 
cgit v1.2.3


From 03c3d7c74371a46d967fbf41628874ec04ddda96 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <cassel@kernel.org>
Date: Thu, 15 Aug 2024 22:11:31 +0200
Subject: nvme-rdma: send cntlid in the RDMA_CM_REQUEST Private Data

When sending a RDMA_CM_REQUEST, the NVMe RDMA Transport Specification
allows you to populate the cntlid field in the RDMA_CM_REQUEST Private
Data.

The cntlid is returned by the target on completion of the first
RDMA_CM_REQUEST command (which creates the admin queue).

The cntlid field can then be populated by the host when the I/O queues
are created (using additional RDMA_CM_REQUEST commands), such that the
target can perform extra validation for additional RDMA_CM_REQUEST
commands.

This additional error code and error message is also added, such that
nvme_rdma_cm_msg() will display the proper error message if the target
fails the RDMA_CM_REQUEST command because of this extra validation.

Signed-off-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme-rdma.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index eb2f04d636c8..97c5f00b9aa3 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -25,6 +25,7 @@ enum nvme_rdma_cm_status {
 	NVME_RDMA_CM_NO_RSC		= 0x06,
 	NVME_RDMA_CM_INVALID_IRD	= 0x07,
 	NVME_RDMA_CM_INVALID_ORD	= 0x08,
+	NVME_RDMA_CM_INVALID_CNTLID	= 0x09,
 };
 
 static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
@@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
 		return "invalid IRD";
 	case NVME_RDMA_CM_INVALID_ORD:
 		return "Invalid ORD";
+	case NVME_RDMA_CM_INVALID_CNTLID:
+		return "invalid controller ID";
 	default:
 		return "unrecognized reason";
 	}
@@ -64,7 +67,8 @@ struct nvme_rdma_cm_req {
 	__le16		qid;
 	__le16		hrqsize;
 	__le16		hsqsize;
-	u8		rsvd[24];
+	__le16		cntlid;
+	u8		rsvd[22];
 };
 
 /**
-- 
cgit v1.2.3


From 4715d87e11ac805ab95a75adfbf93d67dfbdbe81 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Thu, 22 Aug 2024 14:07:02 +0200
Subject: ethtool: Add support for specifying information source in cable test
 results

Enhance the ethtool cable test interface by introducing the ability to
specify the source of the diagnostic information for cable test results.
This is particularly useful for PHYs that offer multiple diagnostic
methods, such as Time Domain Reflectometry (TDR) and Active Link Cable
Diagnostic (ALCD).

Key changes:
- Added `ethnl_cable_test_result_with_src` and
  `ethnl_cable_test_fault_length_with_src` functions to allow specifying
  the information source when reporting cable test results.
- Updated existing `ethnl_cable_test_result` and
  `ethnl_cable_test_fault_length` functions to use TDR as the default
  source, ensuring backward compatibility.
- Modified the UAPI to support these new attributes, enabling drivers to
  provide more detailed diagnostic information.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20240822120703.1393130-3-o.rempel@pengutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool_netlink.h | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index fae0dfb9a9c8..aba91335273a 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -23,8 +23,10 @@ struct phy_device;
 int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd);
 void ethnl_cable_test_free(struct phy_device *phydev);
 void ethnl_cable_test_finished(struct phy_device *phydev);
-int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result);
-int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm);
+int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair,
+				     u8 result, u32 src);
+int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair,
+					   u32 cm, u32 src);
 int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV);
 int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV);
 int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last,
@@ -54,14 +56,14 @@ static inline void ethnl_cable_test_free(struct phy_device *phydev)
 static inline void ethnl_cable_test_finished(struct phy_device *phydev)
 {
 }
-static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair,
-					  u8 result)
+static inline int ethnl_cable_test_result_with_src(struct phy_device *phydev,
+						   u8 pair, u8 result, u32 src)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int ethnl_cable_test_fault_length(struct phy_device *phydev,
-						u8 pair, u32 cm)
+static inline int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev,
+							 u8 pair, u32 cm, u32 src)
 {
 	return -EOPNOTSUPP;
 }
@@ -119,4 +121,19 @@ static inline bool ethtool_dev_mm_supported(struct net_device *dev)
 }
 
 #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */
+
+static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair,
+					  u8 result)
+{
+	return ethnl_cable_test_result_with_src(phydev, pair, result,
+						ETHTOOL_A_CABLE_INF_SRC_TDR);
+}
+
+static inline int ethnl_cable_test_fault_length(struct phy_device *phydev,
+						u8 pair, u32 cm)
+{
+	return ethnl_cable_test_fault_length_with_src(phydev, pair, cm,
+						      ETHTOOL_A_CABLE_INF_SRC_TDR);
+}
+
 #endif /* _LINUX_ETHTOOL_NETLINK_H_ */
-- 
cgit v1.2.3


From 19f1f11c9a8e92a2211a3854bd2cfbd4bb6303f1 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Thu, 22 Aug 2024 13:57:27 +0100
Subject: net: qualcomm: rmnet: Correct spelling in if_rmnet.h

Correct spelling in if_rmnet.h
As reported by codespell.

Cc: Sean Tranchetti <quic_stranche@quicinc.com>
Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Subash Abhinov Kasiviswanathan <quic_subashab@quicinc.com>
Link: https://patch.msgid.link/20240822-net-spell-v1-6-3a98971ce2d2@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_rmnet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 839d1e48b85e..c44bf6e80ecb 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -42,7 +42,7 @@ struct rmnet_map_ul_csum_header {
 
 /* csum_info field:
  *  OFFSET:	where (offset in bytes) to insert computed checksum
- *  UDP:	1 = UDP checksum (zero checkum means no checksum)
+ *  UDP:	1 = UDP checksum (zero checksum means no checksum)
  *  ENABLED:	1 = checksum computation requested
  */
 #define MAP_CSUM_UL_OFFSET_MASK		GENMASK(13, 0)
-- 
cgit v1.2.3


From 70d0bb45fae87a3b08970a318e15f317446a1956 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Thu, 22 Aug 2024 13:57:33 +0100
Subject: net: Correct spelling in headers

Correct spelling in Networking headers.
As reported by codespell.

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20240822-net-spell-v1-12-3a98971ce2d2@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/etherdevice.h | 2 +-
 include/linux/netdevice.h   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 0ed47d00549b..30114c25ad12 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -645,7 +645,7 @@ static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb)
 }
 
 /**
- * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame
+ * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame
  * @skb: Buffer to pad
  *
  * An Ethernet frame should have a minimum size of 60 bytes.  This function
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7dfa836d9dbf..fce70990b209 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1237,7 +1237,7 @@ struct netdev_net_notifier {
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
  *		      const unsigned char *addr, u16 vid)
- *	Deletes the FDB entry from dev coresponding to addr.
+ *	Deletes the FDB entry from dev corresponding to addr.
  * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
  *			   struct netlink_ext_ack *extack);
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
@@ -3514,7 +3514,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
 	dql_completed(&dev_queue->dql, bytes);
 
 	/*
-	 * Without the memory barrier there is a small possiblity that
+	 * Without the memory barrier there is a small possibility that
 	 * netdev_tx_sent_queue will miss the update and cause the queue to
 	 * be stopped forever
 	 */
@@ -4583,7 +4583,7 @@ void dev_uc_flush(struct net_device *dev);
 void dev_uc_init(struct net_device *dev);
 
 /**
- *  __dev_uc_sync - Synchonize device's unicast list
+ *  __dev_uc_sync - Synchronize device's unicast list
  *  @dev:  device to sync
  *  @sync: function to call if address should be added
  *  @unsync: function to call if address should be removed
@@ -4627,7 +4627,7 @@ void dev_mc_flush(struct net_device *dev);
 void dev_mc_init(struct net_device *dev);
 
 /**
- *  __dev_mc_sync - Synchonize device's multicast list
+ *  __dev_mc_sync - Synchronize device's multicast list
  *  @dev:  device to sync
  *  @sync: function to call if address should be added
  *  @unsync: function to call if address should be removed
-- 
cgit v1.2.3


From cead0b8991713bdeb5b947758dd62287fcf8da40 Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Mon, 26 Aug 2024 16:09:43 +0530
Subject: nvme: rename apptag and appmask to lbat and lbatm

Rename apptag and appmask to lbat and lbatm so that it matches the field
names used in NVMe spec.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/nvme.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7b2ae2e43544..b58d9405d65e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -987,8 +987,8 @@ struct nvme_rw_command {
 	__le16			control;
 	__le32			dsmgmt;
 	__le32			reftag;
-	__le16			apptag;
-	__le16			appmask;
+	__le16			lbat;
+	__le16			lbatm;
 };
 
 enum {
@@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd {
 	__le16			control;
 	__le32			dsmgmt;
 	__le32			reftag;
-	__le16			apptag;
-	__le16			appmask;
+	__le16			lbat;
+	__le16			lbatm;
 };
 
 enum nvme_zone_mgmt_action {
-- 
cgit v1.2.3


From 50c6dbdfd16e312382842198a7919341ad480e05 Mon Sep 17 00:00:00 2001
From: Max Ramanouski <max8rr8@gmail.com>
Date: Sun, 25 Aug 2024 01:01:11 +0300
Subject: x86/ioremap: Improve iounmap() address range checks

Allowing iounmap() on memory that was not ioremap()'d in the first
place is obviously a bad idea.  There is currently a feeble attempt to
avoid errant iounmap()s by checking to see if the address is below
"high_memory".  But that's imprecise at best because there are plenty
of high addresses that are also invalid to call iounmap() on.

Thankfully, there is a more precise helper: is_ioremap_addr().  x86
just does not use it in iounmap().

Restrict iounmap() to addresses in the ioremap region, by using
is_ioremap_addr(). This aligns x86 closer to the generic iounmap()
implementation.

Additionally, add a warning in case there is an attempt to iounmap()
invalid memory.  This replaces an existing silent return and will
help alert folks to any incorrect usage of iounmap().

Due to VMALLOC_START on i386 not being present in asm/pgtable.h,
include for asm/vmalloc.h had to be added to include/linux/ioremap.h.

[ dhansen: tweak subject and changelog ]

Signed-off-by: Max Ramanouski <max8rr8@gmail.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Link: https://lore.kernel.org/all/20240824220111.84441-1-max8rr8%40gmail.com
---
 include/linux/ioremap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h
index f0e99fc7dd8b..2bd1661fe9ad 100644
--- a/include/linux/ioremap.h
+++ b/include/linux/ioremap.h
@@ -4,6 +4,7 @@
 
 #include <linux/kasan.h>
 #include <asm/pgtable.h>
+#include <asm/vmalloc.h>
 
 #if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP)
 /*
-- 
cgit v1.2.3


From 1bf8012fc6997f2117f6919369cde16659db60e0 Mon Sep 17 00:00:00 2001
From: Wen Yang <wen.yang@linux.dev>
Date: Mon, 19 Aug 2024 22:59:45 +0800
Subject: pstore: replace spinlock_t by raw_spinlock_t

pstore_dump() is called when both preemption and local IRQ are disabled,
and a spinlock is obtained, which is problematic for the RT kernel because
in this configuration, spinlocks are sleep locks.

Replace the spinlock_t with raw_spinlock_t to avoid sleeping in atomic context.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
Cc: Kees Cook <kees@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Guilherme G. Piccoli <gpiccoli@igalia.com>
Cc: linux-hardening@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/20240819145945.61274-1-wen.yang@linux.dev
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/pstore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 638507a3c8ff..fed601053c51 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -182,7 +182,7 @@ struct pstore_info {
 	struct module	*owner;
 	const char	*name;
 
-	spinlock_t	buf_lock;
+	raw_spinlock_t	buf_lock;
 	char		*buf;
 	size_t		bufsize;
 
-- 
cgit v1.2.3


From 13acf2b74803a9a9ab3475c306d5814cf3cefbd8 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Mon, 26 Aug 2024 14:32:42 +0800
Subject: ata: libata: Remove obsolete function declarations

The function ata_schedule_scsi_eh() was removed with commit
f8bbfc247efb ("[PATCH] SCSI: make scsi_implement_eh() generic API for
SCSI transports"), and the function ata_sff_irq_clear() was removed
with commit 37f65b8bc262("libata-sff: ata_sff_irq_clear() is BMDMA
specific").

Remove the now useless declarations of these functions in
drivers/ata/libata.h and include/linux/libata.h.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 include/linux/libata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0279c0a6302f..6552e90753ae 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -2006,7 +2006,6 @@ extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc,
 extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc,
 			unsigned char *buf, unsigned int buflen, int rw);
 extern void ata_sff_irq_on(struct ata_port *ap);
-extern void ata_sff_irq_clear(struct ata_port *ap);
 extern int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
 			    u8 status, int in_wq);
 extern void ata_sff_queue_work(struct work_struct *work);
-- 
cgit v1.2.3


From cda1fba15cb2282b3c364805c9767698f11c3b0e Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Fri, 23 Aug 2024 00:25:12 +0200
Subject: dpll: add Embedded SYNC feature for a pin

Implement and document new pin attributes for providing Embedded SYNC
capabilities to the DPLL subsystem users through a netlink pin-get
do/dump messages. Allow the user to set Embedded SYNC frequency with
pin-set do netlink message.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20240822222513.255179-2-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dpll.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index d275736230b3..81f7b623d0ba 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -15,6 +15,7 @@
 
 struct dpll_device;
 struct dpll_pin;
+struct dpll_pin_esync;
 
 struct dpll_device_ops {
 	int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv,
@@ -83,6 +84,13 @@ struct dpll_pin_ops {
 	int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv,
 		       const struct dpll_device *dpll, void *dpll_priv,
 		       s64 *ffo, struct netlink_ext_ack *extack);
+	int (*esync_set)(const struct dpll_pin *pin, void *pin_priv,
+			 const struct dpll_device *dpll, void *dpll_priv,
+			 u64 freq, struct netlink_ext_ack *extack);
+	int (*esync_get)(const struct dpll_pin *pin, void *pin_priv,
+			 const struct dpll_device *dpll, void *dpll_priv,
+			 struct dpll_pin_esync *esync,
+			 struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
@@ -111,6 +119,13 @@ struct dpll_pin_phase_adjust_range {
 	s32 max;
 };
 
+struct dpll_pin_esync {
+	u64 freq;
+	const struct dpll_pin_frequency *range;
+	u8 range_num;
+	u8 pulse;
+};
+
 struct dpll_pin_properties {
 	const char *board_label;
 	const char *panel_label;
-- 
cgit v1.2.3


From 7e8fb2eda9885ea2d13179a4c0bbf810f900ef25 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 22 Jul 2024 22:16:47 -0700
Subject: jbd2: fix kernel-doc for j_transaction_overhead_buffers

Use the correct struct member name in the kernel-doc notation
to prevent a kernel-doc build warning.

include/linux/jbd2.h:1303: warning: Function parameter or struct member 'j_transaction_overhead_buffers' not described in 'journal_s'
include/linux/jbd2.h:1303: warning: Excess struct member 'j_transaction_overhead' description in 'journal_s'

Fixes: e3a00a23781c ("jbd2: precompute number of transaction descriptor blocks")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20240710182252.4c281445@canb.auug.org.au/
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20240723051647.3053491-1-rdunlap@infradead.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5157d92b6f23..17662eae408f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1086,7 +1086,7 @@ struct journal_s
 	int			j_revoke_records_per_block;
 
 	/**
-	 * @j_transaction_overhead:
+	 * @j_transaction_overhead_buffers:
 	 *
 	 * Number of blocks each transaction needs for its own bookkeeping
 	 */
-- 
cgit v1.2.3


From fa10db138d205362026daecc8ea65d4e339ade00 Mon Sep 17 00:00:00 2001
From: Kemeng Shi <shikemeng@huaweicloud.com>
Date: Thu, 1 Aug 2024 09:38:10 +0800
Subject: jbd2: remove unused return value of jbd2_fc_release_bufs

Remove unused return value of jbd2_fc_release_bufs.

Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://patch.msgid.link/20240801013815.2393869-4-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 17662eae408f..8aef9bb6ad57 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1675,7 +1675,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
 int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
-int jbd2_fc_release_bufs(journal_t *journal);
+void jbd2_fc_release_bufs(journal_t *journal);
 
 /*
  * is_journal_abort
-- 
cgit v1.2.3


From 9a948c0c8e741776ee6d938b49d34d22034fbb7d Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 14 Aug 2024 11:34:15 +0800
Subject: ceph: Remove unused declarations

These functions is never implemented and used.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index f66f6aac74f6..d7941478158c 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -449,8 +449,6 @@ extern int ceph_osdc_init(struct ceph_osd_client *osdc,
 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
 extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
 
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
-				   struct ceph_msg *msg);
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
-- 
cgit v1.2.3


From 373d3f8dcbb1c0d764123653ca4574be636b8d86 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Sun, 11 Aug 2024 12:47:26 +0800
Subject: wifi: rfkill: Correct parameter type for rfkill_set_hw_state_reason()

Change type of parameter @reason to enum rfkill_hard_block_reasons
for API rfkill_set_hw_state_reason() according to its comments, and
all kernel callers have invoked the API with enum type actually.

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://patch.msgid.link/20240811-rfkill_fix-v2-1-9050760336f4@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/rfkill.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 373003ace639..997b34197385 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -147,7 +147,8 @@ void rfkill_destroy(struct rfkill *rfkill);
  * Prefer to use rfkill_set_hw_state if you don't need any special reason.
  */
 bool rfkill_set_hw_state_reason(struct rfkill *rfkill,
-				bool blocked, unsigned long reason);
+				bool blocked,
+				enum rfkill_hard_block_reasons reason);
 /**
  * rfkill_set_hw_state - Set the internal rfkill hardware block state
  * @rfkill: pointer to the rfkill class to modify.
@@ -280,7 +281,7 @@ static inline void rfkill_destroy(struct rfkill *rfkill)
 
 static inline bool rfkill_set_hw_state_reason(struct rfkill *rfkill,
 					      bool blocked,
-					      unsigned long reason)
+					      enum rfkill_hard_block_reasons reason)
 {
 	return blocked;
 }
-- 
cgit v1.2.3


From 21b91d40575fb64f3f280f6c3af586e32a704a92 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Thu, 8 Aug 2024 22:05:54 +0800
Subject: efi: Remove unused declaration efi_initialize_iomem_resources()

Since commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"),
this is not used anymore.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 6bf3c4fe8511..e28d88066033 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -764,8 +764,6 @@ extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md);
 extern int __efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md);
 extern void efi_mem_reserve(phys_addr_t addr, u64 size);
 extern int efi_mem_reserve_persistent(phys_addr_t addr, u64 size);
-extern void efi_initialize_iomem_resources(struct resource *code_resource,
-		struct resource *data_resource, struct resource *bss_resource);
 extern u64 efi_get_fdt_params(struct efi_memory_map_data *data);
 extern struct kobject *efi_kobj;
 
-- 
cgit v1.2.3


From 2b55d6a42d14c8675e38d6d9adca3014fdf01951 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 20 Aug 2024 17:59:35 +0200
Subject: rcu/kvfree: Add kvfree_rcu_barrier() API

Add a kvfree_rcu_barrier() function. It waits until all
in-flight pointers are freed over RCU machinery. It does
not wait any GP completion and it is within its right to
return immediately if there are no outstanding pointers.

This function is useful when there is a need to guarantee
that a memory is fully freed before destroying memory caches.
For example, during unloading a kernel module.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/rcutiny.h | 5 +++++
 include/linux/rcutree.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d9ac7b136aea..522123050ff8 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	kvfree(ptr);
 }
 
+static inline void kvfree_rcu_barrier(void)
+{
+	rcu_barrier();
+}
+
 #ifdef CONFIG_KASAN_GENERIC
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 #else
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 254244202ea9..58e7db80f3a8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
 
 void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kvfree_rcu_barrier(void);
 
 void rcu_barrier(void);
 void rcu_momentary_dyntick_idle(void);
-- 
cgit v1.2.3


From b3c34245756adada8a50bdaedbb3965b071c7b0a Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 9 Aug 2024 17:36:55 +0200
Subject: kasan: catch invalid free before SLUB reinitializes the object

Currently, when KASAN is combined with init-on-free behavior, the
initialization happens before KASAN's "invalid free" checks.

More importantly, a subsequent commit will want to RCU-delay the actual
SLUB freeing of an object, and we'd like KASAN to still validate
synchronously that freeing the object is permitted. (Otherwise this
change will make the existing testcase kmem_cache_invalid_free fail.)

So add a new KASAN hook that allows KASAN to pre-validate a
kmem_cache_free() operation before SLUB actually starts modifying the
object or its metadata.

Inside KASAN, this:

 - moves checks from poison_slab_object() into check_slab_allocation()
 - moves kasan_arch_is_ready() up into callers of poison_slab_object()
 - removes "ip" argument of poison_slab_object() and __kasan_slab_free()
   (since those functions no longer do any reporting)

Acked-by: Vlastimil Babka <vbabka@suse.cz> #slub
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/kasan.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..1570c7191176 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -175,13 +175,55 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 	return (void *)object;
 }
 
-bool __kasan_slab_free(struct kmem_cache *s, void *object,
-			unsigned long ip, bool init);
+bool __kasan_slab_pre_free(struct kmem_cache *s, void *object,
+			unsigned long ip);
+/**
+ * kasan_slab_pre_free - Check whether freeing a slab object is safe.
+ * @object: Object to be freed.
+ *
+ * This function checks whether freeing the given object is safe. It may
+ * check for double-free and invalid-free bugs and report them.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if freeing the object is unsafe; false otherwise.
+ */
+static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
+						void *object)
+{
+	if (kasan_enabled())
+		return __kasan_slab_pre_free(s, object, _RET_IP_);
+	return false;
+}
+
+bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init);
+/**
+ * kasan_slab_free - Poison, initialize, and quarantine a slab object.
+ * @object: Object to be freed.
+ * @init: Whether to initialize the object.
+ *
+ * This function informs that a slab object has been freed and is not
+ * supposed to be accessed anymore, except for objects in
+ * SLAB_TYPESAFE_BY_RCU caches.
+ *
+ * For KASAN modes that have integrated memory initialization
+ * (kasan_has_integrated_init() == true), this function also initializes
+ * the object's memory. For other modes, the @init argument is ignored.
+ *
+ * This function might also take ownership of the object to quarantine it.
+ * When this happens, KASAN will defer freeing the object to a later
+ * stage and handle it internally until then. The return value indicates
+ * whether KASAN took ownership of the object.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if KASAN took ownership of the object; false otherwise.
+ */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
 						void *object, bool init)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, _RET_IP_, init);
+		return __kasan_slab_free(s, object, init);
 	return false;
 }
 
@@ -371,6 +413,12 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
+
+static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+
 static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
 {
 	return false;
-- 
cgit v1.2.3


From b8c8ba73c68bb3c3e9dad22f488b86c540c839f9 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 9 Aug 2024 17:36:56 +0200
Subject: slub: Introduce CONFIG_SLUB_RCU_DEBUG

Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU
slabs because use-after-free is allowed within the RCU grace period by
design.

Add a SLUB debugging feature which RCU-delays every individual
kmem_cache_free() before either actually freeing the object or handing it
off to KASAN, and change KASAN to poison freed objects as normal when this
option is enabled.

For now I've configured Kconfig.debug to default-enable this feature in the
KASAN GENERIC and SW_TAGS modes; I'm not enabling it by default in HW_TAGS
mode because I'm not sure if it might have unwanted performance degradation
effects there.

Note that this is mostly useful with KASAN in the quarantine-based GENERIC
mode; SLAB_TYPESAFE_BY_RCU slabs are basically always also slabs with a
->ctor, and KASAN's assign_tag() currently has to assign fixed tags for
those, reducing the effectiveness of SW_TAGS/HW_TAGS mode.
(A possible future extension of this work would be to also let SLUB call
the ->ctor() on every allocation instead of only when the slab page is
allocated; then tag-based modes would be able to assign new tags on every
reallocation.)

Tested-by: syzbot+263726e59eab6b442723@syzkaller.appspotmail.com
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Acked-by: Marco Elver <elver@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz> #slab
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/kasan.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 1570c7191176..00a3bf7c0d8f 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -196,15 +196,18 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
 	return false;
 }
 
-bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init);
+bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
+		       bool still_accessible);
 /**
  * kasan_slab_free - Poison, initialize, and quarantine a slab object.
  * @object: Object to be freed.
  * @init: Whether to initialize the object.
+ * @still_accessible: Whether the object contents are still accessible.
  *
  * This function informs that a slab object has been freed and is not
- * supposed to be accessed anymore, except for objects in
- * SLAB_TYPESAFE_BY_RCU caches.
+ * supposed to be accessed anymore, except when @still_accessible is set
+ * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU
+ * grace period might not have passed yet).
  *
  * For KASAN modes that have integrated memory initialization
  * (kasan_has_integrated_init() == true), this function also initializes
@@ -220,10 +223,11 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init);
  * @Return true if KASAN took ownership of the object; false otherwise.
  */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-						void *object, bool init)
+						void *object, bool init,
+						bool still_accessible)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, init);
+		return __kasan_slab_free(s, object, init, still_accessible);
 	return false;
 }
 
@@ -419,7 +423,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
 	return false;
 }
 
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
+				   bool init, bool still_accessible)
 {
 	return false;
 }
-- 
cgit v1.2.3


From a9b8f337ea4eb66e4980f90bb73e8786e56cacfd Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Mon, 12 Aug 2024 06:29:14 +0530
Subject: ACPI: scan: Add a weak arch_sort_irqchip_probe() to order the IRQCHIP
 probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unlike OF framework, the irqchip probe using IRQCHIP_ACPI_DECLARE has no
order defined. Depending on the Makefile is not a good idea. So,
usually it is worked around by mandating only root interrupt controller
probed using IRQCHIP_ACPI_DECLARE and other interrupt controllers are
probed via cascade mechanism.

However, this is also not a clean solution because if there are multiple
root controllers (ex: RINTC in RISC-V which is per CPU) which need to be
probed first, then the cascade will happen for every root controller.
So, introduce an architecture specific weak function
arch_sort_irqchip_probe() to order the probing of the interrupt
controllers which can be implemented by different architectures as per
their interrupt controller hierarchy.

Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://patch.msgid.link/20240812005929.113499-3-sunilvl@ventanamicro.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0687a442fec7..3fff86f95c2f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1343,6 +1343,8 @@ struct acpi_probe_entry {
 	kernel_ulong_t driver_data;
 };
 
+void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr);
+
 #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable,	\
 				 valid, data, fn)			\
 	static const struct acpi_probe_entry __acpi_probe_##name	\
-- 
cgit v1.2.3


From f7d7ccf92f2b9398781f791b4af1a74a9f65b5c3 Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Mon, 12 Aug 2024 06:29:15 +0530
Subject: ACPI: bus: Add acpi_riscv_init() function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new function for RISC-V to do architecture specific initialization
similar to acpi_arm_init(). Some of the ACPI tables are architecture
specific and there is no reason trying to find them on other
architectures. So, add acpi_riscv_init() similar to acpi_arm_init().

Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://patch.msgid.link/20240812005929.113499-4-sunilvl@ventanamicro.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 3fff86f95c2f..892025d873f0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1531,6 +1531,12 @@ void acpi_arm_init(void);
 static inline void acpi_arm_init(void) { }
 #endif
 
+#ifdef CONFIG_RISCV
+void acpi_riscv_init(void);
+#else
+static inline void acpi_riscv_init(void) { }
+#endif
+
 #ifdef CONFIG_ACPI_PCC
 void acpi_init_pcc(void);
 #else
-- 
cgit v1.2.3


From 21734d29f84ad610dbafdca5f4c1bf7ecceeb2fb Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Mon, 12 Aug 2024 06:29:19 +0530
Subject: ACPI: bus: Add RINTC IRQ model for RISC-V
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the IRQ model for RISC-V INTC so that acpi_set_irq_model can use this
for RISC-V.

Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://patch.msgid.link/20240812005929.113499-8-sunilvl@ventanamicro.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 892025d873f0..3a21f1cf126f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -107,6 +107,7 @@ enum acpi_irq_model_id {
 	ACPI_IRQ_MODEL_PLATFORM,
 	ACPI_IRQ_MODEL_GIC,
 	ACPI_IRQ_MODEL_LPIC,
+	ACPI_IRQ_MODEL_RINTC,
 	ACPI_IRQ_MODEL_COUNT
 };
 
-- 
cgit v1.2.3


From fbe826b1c10699a70b81a441fe47b817d1019f37 Mon Sep 17 00:00:00 2001
From: Sunil V L <sunilvl@ventanamicro.com>
Date: Mon, 12 Aug 2024 06:29:27 +0530
Subject: irqchip/riscv-imsic: Add ACPI support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RISC-V IMSIC interrupt controller provides IPI and MSI support.
Currently, DT based drivers setup the IPI feature early during boot but
defer setting up the MSI functionality. However, in ACPI systems, PCI
subsystem is probed early and assume MSI controller is already setup.
Hence, both IPI and MSI features need to be initialized early itself.

Signed-off-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20240812005929.113499-16-sunilvl@ventanamicro.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/irqchip/riscv-imsic.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h
index faf0b800b1b0..7494952c5518 100644
--- a/include/linux/irqchip/riscv-imsic.h
+++ b/include/linux/irqchip/riscv-imsic.h
@@ -8,6 +8,8 @@
 
 #include <linux/types.h>
 #include <linux/bitops.h>
+#include <linux/device.h>
+#include <linux/fwnode.h>
 #include <asm/csr.h>
 
 #define IMSIC_MMIO_PAGE_SHIFT		12
@@ -84,4 +86,11 @@ static inline const struct imsic_global_config *imsic_get_global_config(void)
 
 #endif
 
+#ifdef CONFIG_ACPI
+int imsic_platform_acpi_probe(struct fwnode_handle *fwnode);
+struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev);
+#else
+static inline struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev) { return NULL; }
+#endif
+
 #endif
-- 
cgit v1.2.3


From a707f85d47cad3978bef8fe90c7a79e3998e9d36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:17 +0200
Subject: HID: bpf: constify parameter rdesc of call_hid_bpf_rdesc_fixup()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parameter is never modified, so mark it as const.
Also inline the return statement to avoid a type mismatch error.

This is a prerequisite for constification changes in the HID core.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-1-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid_bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index d4d063cf63b5..6a47223e6460 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -212,7 +212,7 @@ int hid_bpf_connect_device(struct hid_device *hdev);
 void hid_bpf_disconnect_device(struct hid_device *hdev);
 void hid_bpf_destroy_device(struct hid_device *hid);
 int hid_bpf_device_init(struct hid_device *hid);
-u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size);
+u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size);
 #else /* CONFIG_HID_BPF */
 static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type,
 						u8 *data, u32 *size, int interrupt,
-- 
cgit v1.2.3


From 6737769ca0b6bfc82df53db5fd69068902d608db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:18 +0200
Subject: HID: constify parameter rdesc of hid_parse_report()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parameter is never modified, so mark it as const.

This is a prerequisite for constification changes in the HID core.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-2-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 1533c9dcd3a6..e7a5d6f2f2eb 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -953,7 +953,7 @@ struct hid_device *hid_allocate_device(void);
 struct hid_report *hid_register_report(struct hid_device *device,
 				       enum hid_report_type type, unsigned int id,
 				       unsigned int application);
-int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size);
+int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size);
 struct hid_report *hid_validate_values(struct hid_device *hid,
 				       enum hid_report_type type, unsigned int id,
 				       unsigned int field_index,
-- 
cgit v1.2.3


From 24ddd0d7de7aaf50f537fd727f31616cb5a65a9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:19 +0200
Subject: HID: constify hid_device::rdesc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Once a report descriptor has been created by the HID core it is not
supposed to be modified anymore.
Enforce this invariant through the type system.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-3-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index e7a5d6f2f2eb..dda34d8cad19 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -602,7 +602,7 @@ struct hid_ll_driver;
 struct hid_device {							/* device report descriptor */
 	__u8 *dev_rdesc;
 	unsigned dev_rsize;
-	__u8 *rdesc;
+	const __u8 *rdesc;
 	unsigned rsize;
 	struct hid_collection *collection;				/* List of HID collections */
 	unsigned collection_size;					/* Number of allocated hid_collections */
-- 
cgit v1.2.3


From 80cfb508f3fe4c7c6a567fc4aa863c9a38709cd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:20 +0200
Subject: HID: constify params and return value of fetch_item()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fetch_item() does not modify the descriptor it operates on.
As a prerequisite for the constification of hid_driver::dev_rdesc,
mark the parameters and return value of fetch_item() as const.

Also adapt the variable types in the callers to match this
constification.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-4-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index dda34d8cad19..502bbc6f078c 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -46,7 +46,7 @@ struct hid_item {
 	    __s16  s16;
 	    __u32  u32;
 	    __s32  s32;
-	    __u8  *longdata;
+	    const __u8  *longdata;
 	} data;
 };
 
-- 
cgit v1.2.3


From 3593630c89d7d3963c42262694f8aa8b4727a0ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:21 +0200
Subject: HID: constify hid_device::dev_rdesc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Once a report descriptor has been created by the HID core it is not
supposed to be modified anymore.
Enforce this invariant through the type system.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-5-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 502bbc6f078c..c5fb43db0f2e 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -600,7 +600,7 @@ struct hid_driver;
 struct hid_ll_driver;
 
 struct hid_device {							/* device report descriptor */
-	__u8 *dev_rdesc;
+	const __u8 *dev_rdesc;
 	unsigned dev_rsize;
 	const __u8 *rdesc;
 	unsigned rsize;
-- 
cgit v1.2.3


From fe73965d078670406acee0218f118c0870d6a58b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Sat, 3 Aug 2024 14:34:22 +0200
Subject: HID: change return type of report_fixup() to const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By allowing the drivers to return a "const *" they can constify their
static report arrays.
This makes it clear to driver authors that the HID core will not modify
those reports and they can be reused for multiple devices.
Furthermore security is slightly improved as those reports are protected
against accidental or malicious modifications.

[bentiss: fixup hid-cougar.c and hid-multitouch.c for latest version of
the master branch]

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-6-f53d7a7b29d8@weissschuh.net
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index c5fb43db0f2e..3a998fa3812d 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -822,7 +822,7 @@ struct hid_driver {
 			struct hid_usage *usage, __s32 value);
 	void (*report)(struct hid_device *hdev, struct hid_report *report);
 
-	__u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf,
+	const __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf,
 			unsigned int *size);
 
 	int (*input_mapping)(struct hid_device *hdev,
-- 
cgit v1.2.3


From ba7e053ec89f61be9d27bfb244de52849b5138aa Mon Sep 17 00:00:00 2001
From: Artur Weber <aweber.kernel@gmail.com>
Date: Fri, 16 Aug 2024 10:19:09 +0200
Subject: power: supply: max77693: Expose input current limit and CC current
 properties

There are two charger current limit registers:

- Fast charge current limit (which controls current going from the
  charger to the battery);
- CHGIN input current limit (which controls current going into the
  charger through the cable).

Add the necessary functions to retrieve the CHGIN input limit (from CHARGER
regulator) and maximum fast charge current values, and expose them as power
supply properties.

Tested-by: Henrik Grimler <henrik@grimler.se>
Signed-off-by: Artur Weber <aweber.kernel@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240816-max77693-charger-extcon-v4-3-050a0a9bfea0@gmail.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/mfd/max77693-private.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 54444ff2a5de..20c5e02ed9da 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -217,6 +217,10 @@ enum max77693_charger_battery_state {
 #define CHG_CNFG_01_CHGRSTRT_MASK	(0x3 << CHG_CNFG_01_CHGRSTRT_SHIFT)
 #define CHG_CNFG_01_PQEN_MAKS		BIT(CHG_CNFG_01_PQEN_SHIFT)
 
+/* MAX77693_CHG_REG_CHG_CNFG_02 register */
+#define CHG_CNFG_02_CC_SHIFT		0
+#define CHG_CNFG_02_CC_MASK		0x3F
+
 /* MAX77693_CHG_REG_CHG_CNFG_03 register */
 #define CHG_CNFG_03_TOITH_SHIFT		0
 #define CHG_CNFG_03_TOTIME_SHIFT	3
@@ -244,6 +248,7 @@ enum max77693_charger_battery_state {
 #define CHG_CNFG_12_VCHGINREG_MASK	(0x3 << CHG_CNFG_12_VCHGINREG_SHIFT)
 
 /* MAX77693 CHG_CNFG_09 Register */
+#define CHG_CNFG_09_CHGIN_ILIM_SHIFT	0
 #define CHG_CNFG_09_CHGIN_ILIM_MASK	0x7F
 
 /* MAX77693 CHG_CTRL Register */
-- 
cgit v1.2.3


From a96c5515d0d15df103598b2bc57245d66143b5dd Mon Sep 17 00:00:00 2001
From: Tristram Ha <tristram.ha@microchip.com>
Date: Mon, 26 Aug 2024 21:43:08 +0000
Subject: net: dsa: microchip: Add KSZ8895/KSZ8864 switch support

KSZ8895/KSZ8864 is a switch family between KSZ8863/73 and KSZ8795, so it
shares some registers and functions in those switches already
implemented in the KSZ DSA driver.

Signed-off-by: Tristram Ha <tristram.ha@microchip.com>
Tested-by: Pieter Van Trappen <pieter.van.trappen@cern.ch>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/platform_data/microchip-ksz.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index 8c659db4da6b..d074019474f5 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -28,6 +28,8 @@ enum ksz_chip_id {
 	KSZ8794_CHIP_ID = 0x8794,
 	KSZ8765_CHIP_ID = 0x8765,
 	KSZ8830_CHIP_ID = 0x8830,
+	KSZ8864_CHIP_ID = 0x8864,
+	KSZ8895_CHIP_ID = 0x8895,
 	KSZ9477_CHIP_ID = 0x00947700,
 	KSZ9896_CHIP_ID = 0x00989600,
 	KSZ9897_CHIP_ID = 0x00989700,
-- 
cgit v1.2.3


From 1934b212615dc617ac84fc306333ab2b9fc3b04f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 9 Aug 2024 18:00:01 +0200
Subject: file: reclaim 24 bytes from f_owner

We do embedd struct fown_struct into struct file letting it take up 32
bytes in total. We could tweak struct fown_struct to be more compact but
really it shouldn't even be embedded in struct file in the first place.

Instead, actual users of struct fown_struct should allocate the struct
on demand. This frees up 24 bytes in struct file.

That will have some potentially user-visible changes for the ownership
fcntl()s. Some of them can now fail due to allocation failures.
Practically, that probably will almost never happen as the allocations
are small and they only happen once per file.

The fown_struct is used during kill_fasync() which is used by e.g.,
pipes to generate a SIGIO signal. Sending of such signals is conditional
on userspace having set an owner for the file using one of the F_OWNER
fcntl()s. Such users will be unaffected if struct fown_struct is
allocated during the fcntl() call.

There are a few subsystems that call __f_setown() expecting
file->f_owner to be allocated:

(1) tun devices
    file->f_op->fasync::tun_chr_fasync()
    -> __f_setown()

    There are no callers of tun_chr_fasync().

(2) tty devices

    file->f_op->fasync::tty_fasync()
    -> __tty_fasync()
       -> __f_setown()

    tty_fasync() has no additional callers but __tty_fasync() has. Note
    that __tty_fasync() only calls __f_setown() if the @on argument is
    true. It's called from:

    file->f_op->release::tty_release()
    -> tty_release()
       -> __tty_fasync()
          -> __f_setown()

    tty_release() calls __tty_fasync() with @on false
    => __f_setown() is never called from tty_release().
       => All callers of tty_release() are safe as well.

    file->f_op->release::tty_open()
    -> tty_release()
       -> __tty_fasync()
          -> __f_setown()

    __tty_hangup() calls __tty_fasync() with @on false
    => __f_setown() is never called from tty_release().
       => All callers of __tty_hangup() are safe as well.

From the callchains it's obvious that (1) and (2) end up getting called
via file->f_op->fasync(). That can happen either through the F_SETFL
fcntl() with the FASYNC flag raised or via the FIOASYNC ioctl(). If
FASYNC is requested and the file isn't already FASYNC then
file->f_op->fasync() is called with @on true which ends up causing both
(1) and (2) to call __f_setown().

(1) and (2) are the only subsystems that call __f_setown() from the
file->f_op->fasync() handler. So both (1) and (2) have been updated to
allocate a struct fown_struct prior to calling fasync_helper() to
register with the fasync infrastructure. That's safe as they both call
fasync_helper() which also does allocations if @on is true.

The other interesting case are file leases:

(3) file leases
    lease_manager_ops->lm_setup::lease_setup()
    -> __f_setown()

    Which in turn is called from:

    generic_add_lease()
    -> lease_manager_ops->lm_setup::lease_setup()
       -> __f_setown()

So here again we can simply make generic_add_lease() allocate struct
fown_struct prior to the lease_manager_ops->lm_setup::lease_setup()
which happens under a spinlock.

With that the two remaining subsystems that call __f_setown() are:

(4) dnotify
(5) sockets

Both have their own custom ioctls to set struct fown_struct and both
have been converted to allocate a struct fown_struct on demand from
their respective ioctls.

Interactions with O_PATH are fine as well e.g., when opening a /dev/tty
as O_PATH then no file->f_op->open() happens thus no file->f_owner is
allocated. That's fine as no file operation will be set for those and
the device has never been opened. fcntl()s called on such things will
just allocate a ->f_owner on demand. Although I have zero idea why'd you
care about f_owner on an O_PATH fd.

Link: https://lore.kernel.org/r/20240813-work-f_owner-v2-1-4e9343a79f9f@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb0426f349fc..7af239ca87e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -947,6 +947,7 @@ static inline unsigned imajor(const struct inode *inode)
 }
 
 struct fown_struct {
+	struct file *file;	/* backpointer for security modules */
 	rwlock_t lock;          /* protects pid, uid, euid fields */
 	struct pid *pid;	/* pid or -pgrp where SIGIO should be sent */
 	enum pid_type pid_type;	/* Kind of process group SIGIO should be sent to */
@@ -1011,7 +1012,7 @@ struct file {
 	struct mutex		f_pos_lock;
 	loff_t			f_pos;
 	unsigned int		f_flags;
-	struct fown_struct	f_owner;
+	struct fown_struct	*f_owner;
 	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
 	struct path		f_path;
@@ -1076,6 +1077,12 @@ struct file_lease;
 #define OFFT_OFFSET_MAX	type_max(off_t)
 #endif
 
+int file_f_owner_allocate(struct file *file);
+static inline struct fown_struct *file_f_owner(const struct file *file)
+{
+	return READ_ONCE(file->f_owner);
+}
+
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
 static inline struct inode *file_inode(const struct file *f)
@@ -1124,7 +1131,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force
 extern int f_setown(struct file *filp, int who, int force);
 extern void f_delown(struct file *filp);
 extern pid_t f_getown(struct file *filp);
-extern int send_sigurg(struct fown_struct *fown);
+extern int send_sigurg(struct file *file);
 
 /*
  * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
-- 
cgit v1.2.3


From a55d1cbd1720679cfe9837bce250e397ec513989 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Aug 2024 16:14:46 +0200
Subject: fs: switch f_iocb_flags and f_ra

Now that we shrank struct file by 24 bytes we still have a 4 byte hole.
If we move struct file_ra_state into the union and f_iocb_flags out of
the union we close that whole and bring down struct file to 192 bytes.
Which means struct file is 3 cachelines and we managed to shrink it by
40 bytes this cycle.

I've tried to audit all codepaths that use f_ra and none of them seem to
rely on it in file->f_op->release() and never have since commit
1da177e4c3f4 ("Linux-2.6.12-rc2").

Link: https://lore.kernel.org/r/20240823-luftdicht-berappen-d69a2166a0db@brauner
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7af239ca87e2..095a956aeb29 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -999,9 +999,9 @@ struct file {
 		struct callback_head 	f_task_work;
 		/* fput() must use workqueue (most kernel threads). */
 		struct llist_node	f_llist;
-		unsigned int 		f_iocb_flags;
+		/* Invalid after last fput(). */
+		struct file_ra_state	f_ra;
 	};
-
 	/*
 	 * Protects f_ep, f_flags.
 	 * Must not be taken from IRQ context.
@@ -1012,9 +1012,9 @@ struct file {
 	struct mutex		f_pos_lock;
 	loff_t			f_pos;
 	unsigned int		f_flags;
+	unsigned int 		f_iocb_flags;
 	struct fown_struct	*f_owner;
 	const struct cred	*f_cred;
-	struct file_ra_state	f_ra;
 	struct path		f_path;
 	struct inode		*f_inode;	/* cached value */
 	const struct file_operations	*f_op;
-- 
cgit v1.2.3


From 947697c6f0f75f9866f2f891a102dece7a09a064 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 22 Aug 2024 10:18:52 +0530
Subject: uapi: Define GENMASK_U128

This adds GENMASK_U128() and __GENMASK_U128() macros using __BITS_PER_U128
and __int128 data types. These macros will be used in providing support for
generating 128 bit masks.

The macros wouldn't work in all assembler flavors for reasons described
in the comments on top of declarations. Enforce it for more by adding
!__ASSEMBLY__ guard.

Cc: Yury Norov <yury.norov@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>>
Cc: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bits.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bits.h b/include/linux/bits.h
index 0eb24d21aac2..60044b608817 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -36,4 +36,19 @@
 #define GENMASK_ULL(h, l) \
 	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
 
+#if !defined(__ASSEMBLY__)
+/*
+ * Missing asm support
+ *
+ * __GENMASK_U128() depends on _BIT128() which would not work
+ * in the asm code, as it shifts an 'unsigned __init128' data
+ * type instead of direct representation of 128 bit constants
+ * such as long and unsigned long. The fundamental problem is
+ * that a 128 bit constant will get silently truncated by the
+ * gcc compiler.
+ */
+#define GENMASK_U128(h, l) \
+	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
+#endif
+
 #endif	/* __LINUX_BITS_H */
-- 
cgit v1.2.3


From 57413d8e172c10c90fbd91f98d0f7d8eb27e824c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 27 Aug 2024 08:50:47 +0200
Subject: fs: sort out the fallocate mode vs flag mess

The fallocate system call takes a mode argument, but that argument
contains a wild mix of exclusive modes and an optional flags.

Replace FALLOC_FL_SUPPORTED_MASK with FALLOC_FL_MODE_MASK, which excludes
the optional flag bit, so that we can use switch statement on the value
to easily enumerate the cases while getting the check for duplicate modes
for free.

To make this (and in the future the file system implementations) more
readable also add a symbolic name for the 0 mode used to allocate blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240827065123.1762168-4-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/falloc.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index f3f0b97b1675..3f49f3df6af5 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -25,12 +25,18 @@ struct space_resv {
 #define FS_IOC_UNRESVSP64	_IOW('X', 43, struct space_resv)
 #define FS_IOC_ZERO_RANGE	_IOW('X', 57, struct space_resv)
 
-#define	FALLOC_FL_SUPPORTED_MASK	(FALLOC_FL_KEEP_SIZE |		\
-					 FALLOC_FL_PUNCH_HOLE |		\
-					 FALLOC_FL_COLLAPSE_RANGE |	\
-					 FALLOC_FL_ZERO_RANGE |		\
-					 FALLOC_FL_INSERT_RANGE |	\
-					 FALLOC_FL_UNSHARE_RANGE)
+/*
+ * Mask of all supported fallocate modes.  Only one can be set at a time.
+ *
+ * In addition to the mode bit, the mode argument can also encode flags.
+ * FALLOC_FL_KEEP_SIZE is the only supported flag so far.
+ */
+#define FALLOC_FL_MODE_MASK	(FALLOC_FL_ALLOCATE_RANGE |	\
+				 FALLOC_FL_PUNCH_HOLE |		\
+				 FALLOC_FL_COLLAPSE_RANGE |	\
+				 FALLOC_FL_ZERO_RANGE |		\
+				 FALLOC_FL_INSERT_RANGE |	\
+				 FALLOC_FL_UNSHARE_RANGE)
 
 /* on ia32 l_start is on a 32-bit boundary */
 #if defined(CONFIG_X86_64)
-- 
cgit v1.2.3


From 654beb75ca95240c00c78be0bbc2ea66a125a997 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 21 Aug 2024 18:46:33 +0200
Subject: dma: ipu: Remove include/linux/dma/ipu-dma.h

When this file was renamed in commit b8a6d9980f75 ("dma: ipu: rename
mach/ipu.h to include/linux/dma/ipu-dma.h"), 4 .c files have been modified
accordingly:

  drivers/dma/ipu/ipu_idmac.c
  drivers/dma/ipu/ipu_irq.c
  --> removed in commit f1de55ff7c70 ("dmaengine: ipu: Remove the driver")
      in 2023-08

  drivers/media/platform/soc_camera/mx3_camera.c
  --> removed in commit c93cc61475eb ("[media] staging/media: remove
      deprecated mx3 driver") in 2016-06

  drivers/video/mx3fb.c
  --> removed in commit bfac19e239a7 ("fbdev: mx3fb: Remove the driver")
      in 2023-08

Now include/linux/dma/ipu-dma.h is unused and can be removed as-well.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/532e7e2816ccf226f3ab1fa76ec7873bc09299d0.1724258714.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/ipu-dma.h | 174 --------------------------------------------
 1 file changed, 174 deletions(-)
 delete mode 100644 include/linux/dma/ipu-dma.h

(limited to 'include/linux')

diff --git a/include/linux/dma/ipu-dma.h b/include/linux/dma/ipu-dma.h
deleted file mode 100644
index 6969391580d2..000000000000
--- a/include/linux/dma/ipu-dma.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2008
- * Guennadi Liakhovetski, DENX Software Engineering, <lg@denx.de>
- *
- * Copyright (C) 2005-2007 Freescale Semiconductor, Inc.
- */
-
-#ifndef __LINUX_DMA_IPU_DMA_H
-#define __LINUX_DMA_IPU_DMA_H
-
-#include <linux/types.h>
-#include <linux/dmaengine.h>
-
-/* IPU DMA Controller channel definitions. */
-enum ipu_channel {
-	IDMAC_IC_0 = 0,		/* IC (encoding task) to memory */
-	IDMAC_IC_1 = 1,		/* IC (viewfinder task) to memory */
-	IDMAC_ADC_0 = 1,
-	IDMAC_IC_2 = 2,
-	IDMAC_ADC_1 = 2,
-	IDMAC_IC_3 = 3,
-	IDMAC_IC_4 = 4,
-	IDMAC_IC_5 = 5,
-	IDMAC_IC_6 = 6,
-	IDMAC_IC_7 = 7,		/* IC (sensor data) to memory */
-	IDMAC_IC_8 = 8,
-	IDMAC_IC_9 = 9,
-	IDMAC_IC_10 = 10,
-	IDMAC_IC_11 = 11,
-	IDMAC_IC_12 = 12,
-	IDMAC_IC_13 = 13,
-	IDMAC_SDC_0 = 14,	/* Background synchronous display data */
-	IDMAC_SDC_1 = 15,	/* Foreground data (overlay) */
-	IDMAC_SDC_2 = 16,
-	IDMAC_SDC_3 = 17,
-	IDMAC_ADC_2 = 18,
-	IDMAC_ADC_3 = 19,
-	IDMAC_ADC_4 = 20,
-	IDMAC_ADC_5 = 21,
-	IDMAC_ADC_6 = 22,
-	IDMAC_ADC_7 = 23,
-	IDMAC_PF_0 = 24,
-	IDMAC_PF_1 = 25,
-	IDMAC_PF_2 = 26,
-	IDMAC_PF_3 = 27,
-	IDMAC_PF_4 = 28,
-	IDMAC_PF_5 = 29,
-	IDMAC_PF_6 = 30,
-	IDMAC_PF_7 = 31,
-};
-
-/* Order significant! */
-enum ipu_channel_status {
-	IPU_CHANNEL_FREE,
-	IPU_CHANNEL_INITIALIZED,
-	IPU_CHANNEL_READY,
-	IPU_CHANNEL_ENABLED,
-};
-
-#define IPU_CHANNELS_NUM 32
-
-enum pixel_fmt {
-	/* 1 byte */
-	IPU_PIX_FMT_GENERIC,
-	IPU_PIX_FMT_RGB332,
-	IPU_PIX_FMT_YUV420P,
-	IPU_PIX_FMT_YUV422P,
-	IPU_PIX_FMT_YUV420P2,
-	IPU_PIX_FMT_YVU422P,
-	/* 2 bytes */
-	IPU_PIX_FMT_RGB565,
-	IPU_PIX_FMT_RGB666,
-	IPU_PIX_FMT_BGR666,
-	IPU_PIX_FMT_YUYV,
-	IPU_PIX_FMT_UYVY,
-	/* 3 bytes */
-	IPU_PIX_FMT_RGB24,
-	IPU_PIX_FMT_BGR24,
-	/* 4 bytes */
-	IPU_PIX_FMT_GENERIC_32,
-	IPU_PIX_FMT_RGB32,
-	IPU_PIX_FMT_BGR32,
-	IPU_PIX_FMT_ABGR32,
-	IPU_PIX_FMT_BGRA32,
-	IPU_PIX_FMT_RGBA32,
-};
-
-enum ipu_color_space {
-	IPU_COLORSPACE_RGB,
-	IPU_COLORSPACE_YCBCR,
-	IPU_COLORSPACE_YUV
-};
-
-/*
- * Enumeration of IPU rotation modes
- */
-enum ipu_rotate_mode {
-	/* Note the enum values correspond to BAM value */
-	IPU_ROTATE_NONE = 0,
-	IPU_ROTATE_VERT_FLIP = 1,
-	IPU_ROTATE_HORIZ_FLIP = 2,
-	IPU_ROTATE_180 = 3,
-	IPU_ROTATE_90_RIGHT = 4,
-	IPU_ROTATE_90_RIGHT_VFLIP = 5,
-	IPU_ROTATE_90_RIGHT_HFLIP = 6,
-	IPU_ROTATE_90_LEFT = 7,
-};
-
-/*
- * Enumeration of DI ports for ADC.
- */
-enum display_port {
-	DISP0,
-	DISP1,
-	DISP2,
-	DISP3
-};
-
-struct idmac_video_param {
-	unsigned short		in_width;
-	unsigned short		in_height;
-	uint32_t		in_pixel_fmt;
-	unsigned short		out_width;
-	unsigned short		out_height;
-	uint32_t		out_pixel_fmt;
-	unsigned short		out_stride;
-	bool			graphics_combine_en;
-	bool			global_alpha_en;
-	bool			key_color_en;
-	enum display_port	disp;
-	unsigned short		out_left;
-	unsigned short		out_top;
-};
-
-/*
- * Union of initialization parameters for a logical channel. So far only video
- * parameters are used.
- */
-union ipu_channel_param {
-	struct idmac_video_param video;
-};
-
-struct idmac_tx_desc {
-	struct dma_async_tx_descriptor	txd;
-	struct scatterlist		*sg;	/* scatterlist for this */
-	unsigned int			sg_len;	/* tx-descriptor. */
-	struct list_head		list;
-};
-
-struct idmac_channel {
-	struct dma_chan		dma_chan;
-	dma_cookie_t		completed;	/* last completed cookie	   */
-	union ipu_channel_param	params;
-	enum ipu_channel	link;	/* input channel, linked to the output	   */
-	enum ipu_channel_status	status;
-	void			*client;	/* Only one client per channel	   */
-	unsigned int		n_tx_desc;
-	struct idmac_tx_desc	*desc;		/* allocated tx-descriptors	   */
-	struct scatterlist	*sg[2];	/* scatterlist elements in buffer-0 and -1 */
-	struct list_head	free_list;	/* free tx-descriptors		   */
-	struct list_head	queue;		/* queued tx-descriptors	   */
-	spinlock_t		lock;		/* protects sg[0,1], queue	   */
-	struct mutex		chan_mutex; /* protects status, cookie, free_list  */
-	bool			sec_chan_en;
-	int			active_buffer;
-	unsigned int		eof_irq;
-	char			eof_name[16];	/* EOF IRQ name for request_irq()  */
-};
-
-#define to_tx_desc(tx) container_of(tx, struct idmac_tx_desc, txd)
-#define to_idmac_chan(c) container_of(c, struct idmac_channel, dma_chan)
-
-#endif /* __LINUX_DMA_IPU_DMA_H */
-- 
cgit v1.2.3


From 73d5fc92a11cacb73a1aac0b5793c47e48c5b537 Mon Sep 17 00:00:00 2001
From: Nishad Saraf <nishads@amd.com>
Date: Mon, 19 Aug 2024 14:19:48 -0700
Subject: dmaengine: amd: qdma: Add AMD QDMA driver

Adds driver to enable PCIe board which uses AMD QDMA (the Queue-based
Direct Memory Access) subsystem. For example, Xilinx Alveo V70 AI
Accelerator devices.
    https://www.xilinx.com/applications/data-center/v70.html

The QDMA subsystem is used in conjunction with the PCI Express IP block
to provide high performance data transfer between host memory and the
card's DMA subsystem.

            +-------+       +-------+       +-----------+
   PCIe     |       |       |       |       |           |
   Tx/Rx    |       |       |       |  AXI  |           |
 <=======>  | PCIE  | <===> | QDMA  | <====>| User Logic|
            |       |       |       |       |           |
            +-------+       +-------+       +-----------+

The primary mechanism to transfer data using the QDMA is for the QDMA
engine to operate on instructions (descriptors) provided by the host
operating system. Using the descriptors, the QDMA can move data in both
the Host to Card (H2C) direction, or the Card to Host (C2H) direction.
The QDMA provides a per-queue basis option whether DMA traffic goes
to an AXI4 memory map (MM) interface or to an AXI4-Stream interface.

The hardware detail is provided by
    https://docs.xilinx.com/r/en-US/pg302-qdma

Implements dmaengine APIs to support MM DMA transfers.
- probe the available DMA channels
- use dma_slave_map for channel lookup
- use virtual channel to manage dmaengine tx descriptors
- implement device_prep_slave_sg callback to handle host scatter gather
  list

Signed-off-by: Nishad Saraf <nishads@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://lore.kernel.org/r/20240819211948.688786-2-lizhi.hou@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/platform_data/amd_qdma.h | 36 ++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/linux/platform_data/amd_qdma.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h
new file mode 100644
index 000000000000..576d952f97ed
--- /dev/null
+++ b/include/linux/platform_data/amd_qdma.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _PLATDATA_AMD_QDMA_H
+#define _PLATDATA_AMD_QDMA_H
+
+#include <linux/dmaengine.h>
+
+/**
+ * struct qdma_queue_info - DMA queue information. This information is used to
+ *			    match queue when DMA channel is requested
+ * @dir: Channel transfer direction
+ */
+struct qdma_queue_info {
+	enum dma_transfer_direction dir;
+};
+
+#define QDMA_FILTER_PARAM(qinfo)	((void *)(qinfo))
+
+struct dma_slave_map;
+
+/**
+ * struct qdma_platdata - Platform specific data for QDMA engine
+ * @max_mm_channels: Maximum number of MM DMA channels in each direction
+ * @device_map: DMA slave map
+ * @irq_index: The index of first IRQ
+ */
+struct qdma_platdata {
+	u32			max_mm_channels;
+	u32			irq_index;
+	struct dma_slave_map	*device_map;
+};
+
+#endif /* _PLATDATA_AMD_QDMA_H */
-- 
cgit v1.2.3


From 4bb59323450db610b06c549fa4d5936e94fb9a36 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Sat, 10 Aug 2024 17:45:40 +0800
Subject: dmaengine: ti: k3-udma: Remove unused declarations

Commit d70241913413 ("dmaengine: ti: k3-udma: Add glue layer for non DMAengine users")
declared but never implemented these.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Acked-by: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Link: https://lore.kernel.org/r/20240810094540.2589310-1-yuehaibing@huawei.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/k3-udma-glue.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h
index 1e491c5dcac2..2dea217629d0 100644
--- a/include/linux/dma/k3-udma-glue.h
+++ b/include/linux/dma/k3-udma-glue.h
@@ -136,8 +136,6 @@ u32 k3_udma_glue_rx_flow_get_fdq_id(struct k3_udma_glue_rx_channel *rx_chn,
 u32 k3_udma_glue_rx_get_flow_id_base(struct k3_udma_glue_rx_channel *rx_chn);
 int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn,
 			    u32 flow_num);
-void k3_udma_glue_rx_put_irq(struct k3_udma_glue_rx_channel *rx_chn,
-			     u32 flow_num);
 void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn,
 		u32 flow_num, void *data,
 		void (*cleanup)(void *data, dma_addr_t desc_dma),
-- 
cgit v1.2.3


From 41845541adebc503b8574943c92670016d5e566b Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Fri, 23 Aug 2024 17:05:18 +0800
Subject: firmware: arm_scmi: Add initial support for i.MX BBM protocol

i.MX95 has a battery-backed module(BBM), which has persistent storage
(GPR), an RTC, and the ON/OFF button. The System Manager(SM) firmware
use SCMI vendor protocol(SCMI BBM) to let agent be able to use GPR, RTC
and ON/OFF button.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Message-Id: <20240823-imx95-bbm-misc-v2-v8-2-e600ed9e9271@nxp.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_imx_protocol.h | 42 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 include/linux/scmi_imx_protocol.h

(limited to 'include/linux')

diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h
new file mode 100644
index 000000000000..2df2ea0f1809
--- /dev/null
+++ b/include/linux/scmi_imx_protocol.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SCMI Message Protocol driver NXP extension header
+ *
+ * Copyright 2024 NXP.
+ */
+
+#ifndef _LINUX_SCMI_NXP_PROTOCOL_H
+#define _LINUX_SCMI_NXP_PROTOCOL_H
+
+#include <linux/bitfield.h>
+#include <linux/device.h>
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+enum scmi_nxp_protocol {
+	SCMI_PROTOCOL_IMX_BBM = 0x81,
+};
+
+struct scmi_imx_bbm_proto_ops {
+	int (*rtc_time_set)(const struct scmi_protocol_handle *ph, u32 id,
+			    uint64_t sec);
+	int (*rtc_time_get)(const struct scmi_protocol_handle *ph, u32 id,
+			    u64 *val);
+	int (*rtc_alarm_set)(const struct scmi_protocol_handle *ph, u32 id,
+			     bool enable, u64 sec);
+	int (*button_get)(const struct scmi_protocol_handle *ph, u32 *state);
+};
+
+enum scmi_nxp_notification_events {
+	SCMI_EVENT_IMX_BBM_RTC = 0x0,
+	SCMI_EVENT_IMX_BBM_BUTTON = 0x1,
+};
+
+struct scmi_imx_bbm_notif_report {
+	bool			is_rtc;
+	bool			is_button;
+	ktime_t			timestamp;
+	unsigned int		rtc_id;
+	unsigned int		rtc_evt;
+};
+#endif
-- 
cgit v1.2.3


From 61c9f03e22fc57fe61726c513b1f92c0ed1ef00f Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Fri, 23 Aug 2024 17:05:19 +0800
Subject: firmware: arm_scmi: Add initial support for i.MX MISC protocol

i.MX95 System Manager(SM) firmware includes a SCMI vendor protocol, SCMI
MISC protocol which includes controls that are misc settings/actions that
must be exposed from the SM to agents. They are device specific and are
usually define to access bit fields in various mix block control modules,
IOMUX_GPR, and other General Purpose registers, Control Status Registers
owned by the SM.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Message-Id: <20240823-imx95-bbm-misc-v2-v8-3-e600ed9e9271@nxp.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_imx_protocol.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h
index 2df2ea0f1809..066216f1357a 100644
--- a/include/linux/scmi_imx_protocol.h
+++ b/include/linux/scmi_imx_protocol.h
@@ -15,6 +15,7 @@
 
 enum scmi_nxp_protocol {
 	SCMI_PROTOCOL_IMX_BBM = 0x81,
+	SCMI_PROTOCOL_IMX_MISC = 0x84,
 };
 
 struct scmi_imx_bbm_proto_ops {
@@ -30,6 +31,7 @@ struct scmi_imx_bbm_proto_ops {
 enum scmi_nxp_notification_events {
 	SCMI_EVENT_IMX_BBM_RTC = 0x0,
 	SCMI_EVENT_IMX_BBM_BUTTON = 0x1,
+	SCMI_EVENT_IMX_MISC_CONTROL = 0x0,
 };
 
 struct scmi_imx_bbm_notif_report {
@@ -39,4 +41,19 @@ struct scmi_imx_bbm_notif_report {
 	unsigned int		rtc_id;
 	unsigned int		rtc_evt;
 };
+
+struct scmi_imx_misc_ctrl_notify_report {
+	ktime_t			timestamp;
+	unsigned int		ctrl_id;
+	unsigned int		flags;
+};
+
+struct scmi_imx_misc_proto_ops {
+	int (*misc_ctrl_set)(const struct scmi_protocol_handle *ph, u32 id,
+			     u32 num, u32 *val);
+	int (*misc_ctrl_get)(const struct scmi_protocol_handle *ph, u32 id,
+			     u32 *num, u32 *val);
+	int (*misc_ctrl_req_notify)(const struct scmi_protocol_handle *ph,
+				    u32 ctrl_id, u32 evt_id, u32 flags);
+};
 #endif
-- 
cgit v1.2.3


From 0b4f8a68b292e7ee82107b1ce15c3aad31c864b1 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Fri, 23 Aug 2024 17:05:21 +0800
Subject: firmware: imx: Add i.MX95 MISC driver

The i.MX95 System manager exports SCMI MISC protocol for linux to do
various settings, such as set board gpio expander as wakeup source.

The driver is to add the support.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Message-Id: <20240823-imx95-bbm-misc-v2-v8-5-e600ed9e9271@nxp.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/firmware/imx/sm.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 include/linux/firmware/imx/sm.h

(limited to 'include/linux')

diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h
new file mode 100644
index 000000000000..62a2690e2abd
--- /dev/null
+++ b/include/linux/firmware/imx/sm.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright 2024 NXP
+ */
+
+#ifndef _SCMI_IMX_H
+#define _SCMI_IMX_H
+
+#include <linux/bitfield.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#define SCMI_IMX_CTRL_PDM_CLK_SEL	0	/* AON PDM clock sel */
+#define SCMI_IMX_CTRL_MQS1_SETTINGS	1	/* AON MQS settings */
+#define SCMI_IMX_CTRL_SAI1_MCLK		2	/* AON SAI1 MCLK */
+#define SCMI_IMX_CTRL_SAI3_MCLK		3	/* WAKE SAI3 MCLK */
+#define SCMI_IMX_CTRL_SAI4_MCLK		4	/* WAKE SAI4 MCLK */
+#define SCMI_IMX_CTRL_SAI5_MCLK		5	/* WAKE SAI5 MCLK */
+
+#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_EXT)
+int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val);
+int scmi_imx_misc_ctrl_set(u32 id, u32 val);
+#else
+static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val);
+{
+	return -EOPNOTSUPP;
+}
+#endif
+#endif
-- 
cgit v1.2.3


From c42a01264ba1497eb3193c08ff3c2656d98250a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 19 Jul 2024 06:10:24 +0200
Subject: dma-mapping: don't return errors from dma_set_min_align_mask

A NULL dev->dma_parms indicates either a bus that is not DMA capable or
grave bug in the implementation of the bus code.

There isn't much the driver can do in terms of error handling for either
case, so just warn and continue as DMA operations will fail anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/dma-mapping.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f693aafe221f..cfd6bafec3f9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -575,13 +575,12 @@ static inline unsigned int dma_get_min_align_mask(struct device *dev)
 	return 0;
 }
 
-static inline int dma_set_min_align_mask(struct device *dev,
+static inline void dma_set_min_align_mask(struct device *dev,
 		unsigned int min_align_mask)
 {
 	if (WARN_ON_ONCE(!dev->dma_parms))
-		return -EIO;
+		return;
 	dev->dma_parms->min_align_mask = min_align_mask;
-	return 0;
 }
 
 #ifndef dma_get_cache_alignment
-- 
cgit v1.2.3


From 560a861ab4174b42240157ab5cebe36b8c7bc418 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 19 Jul 2024 06:06:28 +0200
Subject: dma-mapping: don't return errors from dma_set_seg_boundary

A NULL dev->dma_parms indicates either a bus that is not DMA capable or
grave bug in the implementation of the bus code.

There isn't much the driver can do in terms of error handling for either
case, so just warn and continue as DMA operations will fail anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/dma-mapping.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index cfd6bafec3f9..6bd1333dbacb 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -559,13 +559,11 @@ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev,
 	return (dma_get_seg_boundary(dev) >> page_shift) + 1;
 }
 
-static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
+static inline void dma_set_seg_boundary(struct device *dev, unsigned long mask)
 {
-	if (dev->dma_parms) {
-		dev->dma_parms->segment_boundary_mask = mask;
-		return 0;
-	}
-	return -EIO;
+	if (WARN_ON_ONCE(!dev->dma_parms))
+		return;
+	dev->dma_parms->segment_boundary_mask = mask;
 }
 
 static inline unsigned int dma_get_min_align_mask(struct device *dev)
-- 
cgit v1.2.3


From 334304ac2baca7f3e821c47cf5129d90e7a6b1e6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 19 Jul 2024 06:07:38 +0200
Subject: dma-mapping: don't return errors from dma_set_max_seg_size

A NULL dev->dma_parms indicates either a bus that is not DMA capable or
grave bug in the implementation of the bus code.

There isn't much the driver can do in terms of error handling for either
case, so just warn and continue as DMA operations will fail anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC
---
 include/linux/dma-mapping.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 6bd1333dbacb..1524da363734 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -524,13 +524,11 @@ static inline unsigned int dma_get_max_seg_size(struct device *dev)
 	return SZ_64K;
 }
 
-static inline int dma_set_max_seg_size(struct device *dev, unsigned int size)
+static inline void dma_set_max_seg_size(struct device *dev, unsigned int size)
 {
-	if (dev->dma_parms) {
-		dev->dma_parms->max_segment_size = size;
-		return 0;
-	}
-	return -EIO;
+	if (WARN_ON_ONCE(!dev->dma_parms))
+		return;
+	dev->dma_parms->max_segment_size = size;
 }
 
 static inline unsigned long dma_get_seg_boundary(struct device *dev)
-- 
cgit v1.2.3


From b31c9d9dc343146b9f4ce67b4eee748c49296e99 Mon Sep 17 00:00:00 2001
From: Peter Hutterer <peter.hutterer@who-t.net>
Date: Tue, 27 Aug 2024 17:19:29 +0900
Subject: HID: hidraw: add HIDIOCREVOKE ioctl

There is a need for userspace applications to open HID devices directly.
Use-cases include configuration of gaming mice or direct access to
joystick devices. The latter is currently handled by the uaccess tag in
systemd, other devices include more custom/local configurations or just
sudo.

A better approach is what we already have for evdev devices: give the
application a file descriptor and revoke it when it may no longer access
that device.

This patch is the hidraw equivalent to the EVIOCREVOKE ioctl, see
commit c7dc65737c9a ("Input: evdev - add EVIOCREVOKE ioctl") for full
details.

An MR for systemd-logind has been filed here:
https://github.com/systemd/systemd/pull/33970

Signed-off-by: Peter Hutterer <peter.hutterer@who-t.net>
Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-1-d004a7451aea@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hidraw.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hidraw.h b/include/linux/hidraw.h
index cd67f4ca5599..18fd30a288de 100644
--- a/include/linux/hidraw.h
+++ b/include/linux/hidraw.h
@@ -32,6 +32,7 @@ struct hidraw_list {
 	struct hidraw *hidraw;
 	struct list_head node;
 	struct mutex read_mutex;
+	bool revoked;
 };
 
 #ifdef CONFIG_HIDRAW
-- 
cgit v1.2.3


From b35243a447b9fe6457fa8e1352152b818436ba5a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 26 Aug 2024 19:37:54 +0200
Subject: block: rework bio splitting

The current setup with bio_may_exceed_limit and __bio_split_to_limits
is a bit of a mess.

Change it so that __bio_split_to_limits does all the work and is just
a variant of bio_split_to_limits that returns nr_segs.  This is done
by inlining it and instead have the various bio_split_* helpers directly
submit the potentially split bios.

To support btrfs, the rw version has a lower level helper split out
that just returns the offset to split.  This turns out to nicely clean
up the btrfs flow as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Link: https://lore.kernel.org/r/20240826173820.1690925-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index a46e2047bea4..faceadb040f9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
 void bio_trim(struct bio *bio, sector_t offset, sector_t size);
 extern struct bio *bio_split(struct bio *bio, int sectors,
 			     gfp_t gfp, struct bio_set *bs);
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
-		unsigned *segs, struct bio_set *bs, unsigned max_bytes);
+int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
+		unsigned *segs, unsigned max_bytes);
 
 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
-- 
cgit v1.2.3


From 379b122a3ec8033aa43cb70e8ecb6fb7f98aa68f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 26 Aug 2024 19:37:55 +0200
Subject: block: constify the lim argument to
 queue_limits_max_zone_append_sectors

queue_limits_max_zone_append_sectors doesn't change the lim argument,
so mark it as const.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Link: https://lore.kernel.org/r/20240826173820.1690925-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e85ec73a07d5..ec3ea5d1f99d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
+static inline unsigned int
+queue_limits_max_zone_append_sectors(const struct queue_limits *l)
 {
 	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
 
-- 
cgit v1.2.3


From 2e1a57d5b0adbb5bd1d85245ec29b6d3cc7edc69 Mon Sep 17 00:00:00 2001
From: Chris Morgan <macromorgan@hotmail.com>
Date: Wed, 21 Aug 2024 16:54:52 -0500
Subject: mfd: axp20x: Add ADC, BAT, and USB cells for AXP717

Add support for the AXP717 PMIC to utilize the ADC (for reading
voltage, current, and temperature information from the PMIC) as well
as the USB charger and battery.

Signed-off-by: Chris Morgan <macromorgan@hotmail.com>
Link: https://lore.kernel.org/r/20240821215456.962564-12-macroalpha82@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/axp20x.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 8c0a33a2e9ce..9b7ef473adcb 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -115,6 +115,16 @@ enum axp20x_variants {
 #define AXP313A_IRQ_STATE		0x21
 
 #define AXP717_ON_INDICATE		0x00
+#define AXP717_PMU_STATUS_2		0x01
+#define AXP717_BC_DETECT		0x05
+#define AXP717_PMU_FAULT		0x08
+#define AXP717_MODULE_EN_CONTROL_1	0x0b
+#define AXP717_MIN_SYS_V_CONTROL	0x15
+#define AXP717_INPUT_VOL_LIMIT_CTRL	0x16
+#define AXP717_INPUT_CUR_LIMIT_CTRL	0x17
+#define AXP717_MODULE_EN_CONTROL_2	0x19
+#define AXP717_BOOST_CONTROL		0x1e
+#define AXP717_VSYS_V_POWEROFF		0x24
 #define AXP717_IRQ0_EN			0x40
 #define AXP717_IRQ1_EN			0x41
 #define AXP717_IRQ2_EN			0x42
@@ -125,6 +135,9 @@ enum axp20x_variants {
 #define AXP717_IRQ2_STATE		0x4a
 #define AXP717_IRQ3_STATE		0x4b
 #define AXP717_IRQ4_STATE		0x4c
+#define AXP717_ICC_CHG_SET		0x62
+#define AXP717_ITERM_CHG_SET		0x63
+#define AXP717_CV_CHG_SET		0x64
 #define AXP717_DCDC_OUTPUT_CONTROL	0x80
 #define AXP717_DCDC1_CONTROL		0x83
 #define AXP717_DCDC2_CONTROL		0x84
@@ -145,6 +158,19 @@ enum axp20x_variants {
 #define AXP717_CLDO3_CONTROL		0x9d
 #define AXP717_CLDO4_CONTROL		0x9e
 #define AXP717_CPUSLDO_CONTROL		0x9f
+#define AXP717_BATT_PERCENT_DATA	0xa4
+#define AXP717_ADC_CH_EN_CONTROL	0xc0
+#define AXP717_BATT_V_H			0xc4
+#define AXP717_BATT_V_L			0xc5
+#define AXP717_VBUS_V_H			0xc6
+#define AXP717_VBUS_V_L			0xc7
+#define AXP717_VSYS_V_H			0xc8
+#define AXP717_VSYS_V_L			0xc9
+#define AXP717_BATT_CHRG_I_H		0xca
+#define AXP717_BATT_CHRG_I_L		0xcb
+#define AXP717_ADC_DATA_SEL		0xcd
+#define AXP717_ADC_DATA_H		0xce
+#define AXP717_ADC_DATA_L		0xcf
 
 #define AXP806_STARTUP_SRC		0x00
 #define AXP806_CHIP_ID			0x03
-- 
cgit v1.2.3


From c0390d541128e8820af8177a572d9d87ff68a3bb Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 21:06:58 +0200
Subject: fs: pack struct file

Now that we shrunk struct file to 192 bytes aka 3 cachelines reorder
struct file to not leave any holes or have members cross cachelines.

Add a short comment to each of the fields and mark the cachelines.
It's possible that we may have to tweak this based on profiling in the
future. So far I had Jens test this comparing io_uring with non-fixed
and fixed files and it improved performance. The layout is a combination
of Jens' and my changes.

Link: https: //lore.kernel.org/r/20240824-peinigen-hocken-7384b977c643@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 91 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 51 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 095a956aeb29..af8bbd4eeb3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -987,52 +987,63 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 		index <  ra->start + ra->size);
 }
 
-/*
- * f_{lock,count,pos_lock} members can be highly contended and share
- * the same cacheline. f_{lock,mode} are very frequently used together
- * and so share the same cacheline as well. The read-mostly
- * f_{path,inode,op} are kept on a separate cacheline.
+/**
+ * struct file - Represents a file
+ * @f_count: reference count
+ * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
+ * @f_mode: FMODE_* flags often used in hotpaths
+ * @f_op: file operations
+ * @f_mapping: Contents of a cacheable, mappable object.
+ * @private_data: filesystem or driver specific data
+ * @f_inode: cached inode
+ * @f_flags: file flags
+ * @f_iocb_flags: iocb flags
+ * @f_cred: stashed credentials of creator/opener
+ * @f_path: path of the file
+ * @f_pos_lock: lock protecting file position
+ * @f_pos: file position
+ * @f_version: file version
+ * @f_security: LSM security context of this file
+ * @f_owner: file owner
+ * @f_wb_err: writeback error
+ * @f_sb_err: per sb writeback errors
+ * @f_ep: link of all epoll hooks for this file
+ * @f_task_work: task work entry point
+ * @f_llist: work queue entrypoint
+ * @f_ra: file's readahead state
  */
 struct file {
-	union {
-		/* fput() uses task work when closing and freeing file (default). */
-		struct callback_head 	f_task_work;
-		/* fput() must use workqueue (most kernel threads). */
-		struct llist_node	f_llist;
-		/* Invalid after last fput(). */
-		struct file_ra_state	f_ra;
-	};
-	/*
-	 * Protects f_ep, f_flags.
-	 * Must not be taken from IRQ context.
-	 */
-	spinlock_t		f_lock;
-	fmode_t			f_mode;
-	atomic_long_t		f_count;
-	struct mutex		f_pos_lock;
-	loff_t			f_pos;
-	unsigned int		f_flags;
-	unsigned int 		f_iocb_flags;
-	struct fown_struct	*f_owner;
-	const struct cred	*f_cred;
-	struct path		f_path;
-	struct inode		*f_inode;	/* cached value */
+	atomic_long_t			f_count;
+	spinlock_t			f_lock;
+	fmode_t				f_mode;
 	const struct file_operations	*f_op;
-
-	u64			f_version;
+	struct address_space		*f_mapping;
+	void				*private_data;
+	struct inode			*f_inode;
+	unsigned int			f_flags;
+	unsigned int			f_iocb_flags;
+	const struct cred		*f_cred;
+	/* --- cacheline 1 boundary (64 bytes) --- */
+	struct path			f_path;
+	struct mutex			f_pos_lock;
+	loff_t				f_pos;
+	u64				f_version;
+	/* --- cacheline 2 boundary (128 bytes) --- */
 #ifdef CONFIG_SECURITY
-	void			*f_security;
+	void				*f_security;
 #endif
-	/* needed for tty driver, and maybe others */
-	void			*private_data;
-
+	struct fown_struct		*f_owner;
+	errseq_t			f_wb_err;
+	errseq_t			f_sb_err;
 #ifdef CONFIG_EPOLL
-	/* Used by fs/eventpoll.c to link all the hooks to this file */
-	struct hlist_head	*f_ep;
-#endif /* #ifdef CONFIG_EPOLL */
-	struct address_space	*f_mapping;
-	errseq_t		f_wb_err;
-	errseq_t		f_sb_err; /* for syncfs */
+	struct hlist_head		*f_ep;
+#endif
+	union {
+		struct callback_head	f_task_work;
+		struct llist_node	f_llist;
+		struct file_ra_state	f_ra;
+	};
+	/* --- cacheline 3 boundary (192 bytes) --- */
 } __randomize_layout
   __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
 
-- 
cgit v1.2.3


From d345bd2e9834e2da505977e154a1c179c793b7b2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 28 Aug 2024 12:56:24 +0200
Subject: mm: add kmem_cache_create_rcu()

When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer
must be located outside of the object because we don't know what part of
the memory can safely be overwritten as it may be needed to prevent
object recycling.

That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a
new cacheline. This is the case for e.g., struct file. After having it
shrunk down by 40 bytes and having it fit in three cachelines we still
have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to
accommodate the free pointer.

Add a new kmem_cache_create_rcu() function that allows the caller to
specify an offset where the free pointer is supposed to be placed.

Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-2-5460bc1f09f6@kernel.org
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/slab.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index eb2bf4629157..5b2da2cf31a8 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -212,6 +212,12 @@ enum _slab_flag_bits {
 #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
 #endif
 
+/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
@@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 			slab_flags_t flags,
 			unsigned int useroffset, unsigned int usersize,
 			void (*ctor)(void *));
+struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
+					 unsigned int freeptr_offset,
+					 slab_flags_t flags);
 void kmem_cache_destroy(struct kmem_cache *s);
 int kmem_cache_shrink(struct kmem_cache *s);
 
-- 
cgit v1.2.3


From ea566e18b4deea6e998088de4f1a76d1f39c8d3f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 28 Aug 2024 12:56:25 +0200
Subject: fs: use kmem_cache_create_rcu()

Switch to the new kmem_cache_create_rcu() helper which allows us to use
a custom free pointer offset avoiding the need to have an external free
pointer which would grow struct file behind our backs.

Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-3-5460bc1f09f6@kernel.org
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index af8bbd4eeb3a..58c91a52cad1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1011,6 +1011,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_task_work: task work entry point
  * @f_llist: work queue entrypoint
  * @f_ra: file's readahead state
+ * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
  */
 struct file {
 	atomic_long_t			f_count;
@@ -1042,6 +1043,7 @@ struct file {
 		struct callback_head	f_task_work;
 		struct llist_node	f_llist;
 		struct file_ra_state	f_ra;
+		freeptr_t		f_freeptr;
 	};
 	/* --- cacheline 3 boundary (192 bytes) --- */
 } __randomize_layout
-- 
cgit v1.2.3


From f91f2a9879cc77db1f45f690f38f42698580416e Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Wed, 28 Aug 2024 16:34:00 -0700
Subject: dmaengine: idxd: Add a new DSA device ID for Granite Rapids-D
 platform

A new DSA device ID, 0x11fb, is introduced for the Granite Rapids-D
platform. Add the device ID to the IDXD driver.

Since a potential security issue has been fixed on the new device, it's
secure to assign the device to virtual machines, and therefore, the new
device ID will not be added to the VFIO denylist. Additionally, the new
device ID may be useful in identifying and addressing any other potential
issues with this specific device in the future. The same is also applied
to any other new DSA/IAA devices with new device IDs.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20240828233401.186007-2-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e388c8b1cbc2..ff99047dac44 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2706,6 +2706,7 @@
 #define PCI_DEVICE_ID_INTEL_82815_MC	0x1130
 #define PCI_DEVICE_ID_INTEL_82815_CGC	0x1132
 #define PCI_DEVICE_ID_INTEL_SST_TNG	0x119a
+#define PCI_DEVICE_ID_INTEL_DSA_GNRD	0x11fb
 #define PCI_DEVICE_ID_INTEL_82092AA_0	0x1221
 #define PCI_DEVICE_ID_INTEL_82437	0x122d
 #define PCI_DEVICE_ID_INTEL_82371FB_0	0x122e
-- 
cgit v1.2.3


From 4fecf944c051ea852e98c062636bf5b4c7f5f8a7 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Wed, 28 Aug 2024 16:34:01 -0700
Subject: dmaengine: idxd: Add new DSA and IAA device IDs for Diamond Rapids
 platform

A new DSA device ID, 0x1212, and a new IAA device ID, 0x1216, are
introduced for Diamond Rapids platform. Add the device IDs to the IDXD
driver.

The name "IAA" is used in new code instead of the old name "IAX".
However, the "IAX" naming (e.g., IDXD_TYPE_IAX) is retained for legacy
code compatibility.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20240828233401.186007-3-fenghua.yu@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ff99047dac44..8139231d0e86 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2707,6 +2707,8 @@
 #define PCI_DEVICE_ID_INTEL_82815_CGC	0x1132
 #define PCI_DEVICE_ID_INTEL_SST_TNG	0x119a
 #define PCI_DEVICE_ID_INTEL_DSA_GNRD	0x11fb
+#define PCI_DEVICE_ID_INTEL_DSA_DMR	0x1212
+#define PCI_DEVICE_ID_INTEL_IAA_DMR	0x1216
 #define PCI_DEVICE_ID_INTEL_82092AA_0	0x1221
 #define PCI_DEVICE_ID_INTEL_82437	0x122d
 #define PCI_DEVICE_ID_INTEL_82371FB_0	0x122e
-- 
cgit v1.2.3


From 6f606ffd6dd7583d8194ee3d858ba4da2eff26a3 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Aug 2024 14:08:23 -0700
Subject: bpf: Move insn_buf[16] to bpf_verifier_env

This patch moves the 'struct bpf_insn insn_buf[16]' stack usage
to the bpf_verifier_env. A '#define INSN_BUF_SIZE 16' is also added
to replace the ARRAY_SIZE(insn_buf) usages.

Both convert_ctx_accesses() and do_misc_fixup() are changed
to use the env->insn_buf.

It is a refactoring work for adding the epilogue_buf[16] in a later patch.

With this patch, the stack size usage decreased.

Before:
./kernel/bpf/verifier.c:22133:5: warning: stack frame size (2584)

After:
./kernel/bpf/verifier.c:22184:5: warning: stack frame size (2264)

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240829210833.388152-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 279b4a640644..0ad2d189c546 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -23,6 +23,8 @@
  * (in the "-8,-16,...,-512" form)
  */
 #define TMP_STR_BUF_LEN 320
+/* Patch buffer size */
+#define INSN_BUF_SIZE 16
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
@@ -780,6 +782,7 @@ struct bpf_verifier_env {
 	 * e.g., in reg_type_str() to generate reg_type string
 	 */
 	char tmp_str_buf[TMP_STR_BUF_LEN];
+	struct bpf_insn insn_buf[INSN_BUF_SIZE];
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
-- 
cgit v1.2.3


From 169c31761c8d7f606f3ee628829c27998626c4f0 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Aug 2024 14:08:25 -0700
Subject: bpf: Add gen_epilogue to bpf_verifier_ops

This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar
to the existing .gen_prologue. Instead of allowing a subsystem
to run code at the beginning of a bpf prog, it allows the subsystem
to run code just before the bpf prog exit.

One of the use case is to allow the upcoming bpf qdisc to ensure that
the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc
struct_ops implementation could either fix it up or drop the skb.
Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd
has sane value (e.g. non zero).

The epilogue can do the useful thing (like checking skb->dev) if it
can access the bpf prog's ctx. Unlike prologue, r1 may not hold the
ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue
has returned some instructions in the "epilogue_buf".

The existing .gen_prologue is done in convert_ctx_accesses().
The new .gen_epilogue is done in the convert_ctx_accesses() also.
When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched
with the earlier generated "epilogue_buf". The epilogue patching is
only done for the main prog.

Only one epilogue will be patched to the main program. When the
bpf prog has multiple BPF_EXIT instructions, a BPF_JA is used
to goto the earlier patched epilogue. Majority of the archs
support (BPF_JMP32 | BPF_JA): x86, arm, s390, risv64, loongarch,
powerpc and arc. This patch keeps it simple and always
use (BPF_JMP32 | BPF_JA). A new macro BPF_JMP32_A is added to
generate the (BPF_JMP32 | BPF_JA) insn.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240829210833.388152-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  2 ++
 include/linux/bpf_verifier.h |  1 +
 include/linux/filter.h       | 10 ++++++++++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index dc63083f76b7..6f87fb014fba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -974,6 +974,8 @@ struct bpf_verifier_ops {
 				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
+	int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog,
+			    s16 ctx_stack_off);
 	int (*gen_ld_abs)(const struct bpf_insn *orig,
 			  struct bpf_insn *insn_buf);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0ad2d189c546..2e20207315a9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -783,6 +783,7 @@ struct bpf_verifier_env {
 	 */
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 	struct bpf_insn insn_buf[INSN_BUF_SIZE];
+	struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b6672ff61407..99b6fc83825b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -437,6 +437,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn)
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Unconditional jumps, gotol pc + imm32 */
+
+#define BPF_JMP32_A(IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP32 | BPF_JA,			\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
 /* Relative call */
 
 #define BPF_CALL_REL(TGT)					\
-- 
cgit v1.2.3


From 087adb4f0f91ee330446a70af899e6a996e5cc13 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 6 Aug 2024 19:28:46 +0200
Subject: vfs: dodge smp_mb in break_lease and break_deleg in the common case

These inlines show up in the fast path (e.g., in do_dentry_open()) and
induce said full barrier regarding i_flctx access when in most cases the
pointer is NULL.

The pointer can be safely checked before issuing the barrier, dodging it
in most cases as a result.

It is plausible the consume fence would be sufficient, but I don't want
to go audit all callers regarding what they before calling here.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240806172846.886570-1-mjguzik@gmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/filelock.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index daee999d05f3..bb44224c6676 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -420,28 +420,38 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 #ifdef CONFIG_FILE_LOCKING
 static inline int break_lease(struct inode *inode, unsigned int mode)
 {
+	struct file_lock_context *flctx;
+
 	/*
 	 * Since this check is lockless, we must ensure that any refcounts
 	 * taken are done before checking i_flctx->flc_lease. Otherwise, we
 	 * could end up racing with tasks trying to set a new lease on this
 	 * file.
 	 */
+	flctx = READ_ONCE(inode->i_flctx);
+	if (!flctx)
+		return 0;
 	smp_mb();
-	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+	if (!list_empty_careful(&flctx->flc_lease))
 		return __break_lease(inode, mode, FL_LEASE);
 	return 0;
 }
 
 static inline int break_deleg(struct inode *inode, unsigned int mode)
 {
+	struct file_lock_context *flctx;
+
 	/*
 	 * Since this check is lockless, we must ensure that any refcounts
 	 * taken are done before checking i_flctx->flc_lease. Otherwise, we
 	 * could end up racing with tasks trying to set a new lease on this
 	 * file.
 	 */
+	flctx = READ_ONCE(inode->i_flctx);
+	if (!flctx)
+		return 0;
 	smp_mb();
-	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+	if (!list_empty_careful(&flctx->flc_lease))
 		return __break_lease(inode, mode, FL_DELEG);
 	return 0;
 }
-- 
cgit v1.2.3


From 8447d848e1dc6d42d1dcd00f133d7715fc732c47 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 10 Aug 2024 08:47:53 +0200
Subject: vfs: only read fops once in fops_get/put

In do_dentry_open() the usage is:
	f->f_op = fops_get(inode->i_fop);

In generated asm the compiler emits 2 reads from inode->i_fop instead of
just one.

This popped up due to false-sharing where loads from that offset end up
bouncing a cacheline during parallel open. While this is going to be fixed,
the spurious load does not need to be there.

This makes do_dentry_open() go down from 1177 to 1154 bytes.

fops_put() is patched to maintain some consistency.

No functional changes.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240810064753.1211441-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0286d479c9d..696c620f90e0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2553,10 +2553,17 @@ struct super_block *sget(struct file_system_type *type,
 struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
-#define fops_get(fops) \
-	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
-#define fops_put(fops) \
-	do { if (fops) module_put((fops)->owner); } while(0)
+#define fops_get(fops) ({						\
+	const struct file_operations *_fops = (fops);			\
+	(((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL));	\
+})
+
+#define fops_put(fops) ({						\
+	const struct file_operations *_fops = (fops);			\
+	if (_fops)							\
+		module_put((_fops)->owner);				\
+})
+
 /*
  * This one is to be used *ONLY* from ->open() instances.
  * fops must be non-NULL, pinned down *and* module dependencies
-- 
cgit v1.2.3


From 641bb4394f405cba498b100b44541ffc0aed5be1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 9 Aug 2024 12:38:56 +0200
Subject: fs: move FMODE_UNSIGNED_OFFSET to fop_flags

This is another flag that is statically set and doesn't need to use up
an FMODE_* bit. Move it to ->fop_flags and free up another FMODE_* bit.

(1) mem_open() used from proc_mem_operations
(2) adi_open() used from adi_fops
(3) drm_open_helper():
    (3.1) accel_open() used from DRM_ACCEL_FOPS
    (3.2) drm_open() used from
    (3.2.1) amdgpu_driver_kms_fops
    (3.2.2) psb_gem_fops
    (3.2.3) i915_driver_fops
    (3.2.4) nouveau_driver_fops
    (3.2.5) panthor_drm_driver_fops
    (3.2.6) radeon_driver_kms_fops
    (3.2.7) tegra_drm_fops
    (3.2.8) vmwgfx_driver_fops
    (3.2.9) xe_driver_fops
    (3.2.10) DRM_GEM_FOPS
    (3.2.11) DEFINE_DRM_GEM_DMA_FOPS
(4) struct memdev sets fmode flags based on type of device opened. For
    devices using struct mem_fops unsigned offset is used.

Mark all these file operations as FOP_UNSIGNED_OFFSET and add asserts
into the open helper to ensure that the flag is always set.

Link: https://lore.kernel.org/r/20240809-work-fop_unsigned-v1-1-658e054d893e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 696c620f90e0..0145bda465ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -146,8 +146,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)(1 << 12))
 
-/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
-#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)(1 << 13))
+/* FMODE_* bit 13 */
 
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH		((__force fmode_t)(1 << 14))
@@ -2073,6 +2072,8 @@ struct file_operations {
 #define FOP_DIO_PARALLEL_WRITE	((__force fop_flags_t)(1 << 3))
 /* Contains huge pages */
 #define FOP_HUGE_PAGES		((__force fop_flags_t)(1 << 4))
+/* Treat loff_t as unsigned (e.g., /dev/mem) */
+#define FOP_UNSIGNED_OFFSET	((__force fop_flags_t)(1 << 5))
 
 /* Wrap a directory iterator that needs exclusive inode access */
 int wrap_directory_iterator(struct file *, struct dir_context *,
-- 
cgit v1.2.3


From 1c48d441468c425e878d81c5c3e7ff8a9a594cc0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 16 Aug 2024 16:35:52 +0200
Subject: inode: remove __I_DIO_WAKEUP

Afaict, we can just rely on inode->i_dio_count for waiting instead of
this awkward indirection through __I_DIO_WAKEUP. This survives LTP dio
and xfstests dio tests.

Link: https://lore.kernel.org/r/20240816-vfs-misc-dio-v1-1-80fe21a2c710@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0145bda465ff..8f8d274929d7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2373,8 +2373,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *
  * I_REFERENCED		Marks the inode as recently references on the LRU list.
  *
- * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
- *
  * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
  *			synchronize competing switching instances and to tell
  *			wb stat updates to grab the i_pages lock.  See
@@ -2409,8 +2407,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
 #define I_REFERENCED		(1 << 8)
-#define __I_DIO_WAKEUP		9
-#define I_DIO_WAKEUP		(1 << __I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
 #define I_DIRTY_TIME		(1 << 11)
 #define I_WB_SWITCH		(1 << 13)
@@ -3227,7 +3223,9 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 }
 #endif
 
+bool inode_dio_finished(const struct inode *inode);
 void inode_dio_wait(struct inode *inode);
+void inode_dio_wait_interruptible(struct inode *inode);
 
 /**
  * inode_dio_begin - signal start of a direct I/O requests
@@ -3251,7 +3249,7 @@ static inline void inode_dio_begin(struct inode *inode)
 static inline void inode_dio_end(struct inode *inode)
 {
 	if (atomic_dec_and_test(&inode->i_dio_count))
-		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+		wake_up_var(&inode->i_dio_count);
 }
 
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
-- 
cgit v1.2.3


From 029c3f27fe84c5fd29d49a99cc5176433bf12209 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Aug 2024 11:28:38 +0200
Subject: fs: remove unused path_put_init()

This helper has been unused for a while now.

Link: https://lore.kernel.org/r/20240822-bewuchs-werktag-46672b3c0606@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/path.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/path.h b/include/linux/path.h
index ca073e70decd..7ea389dc764b 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -18,12 +18,6 @@ static inline int path_equal(const struct path *path1, const struct path *path2)
 	return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
 }
 
-static inline void path_put_init(struct path *path)
-{
-	path_put(path);
-	*path = (struct path) { };
-}
-
 /*
  * Cleanup macro for use with __free(path_put). Avoids dereference and
  * copying @path unlike DEFINE_FREE(). path_put() will handle the empty
-- 
cgit v1.2.3


From 71ff58ce3428b2471efae5b00594e892b285c97c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Aug 2024 11:30:58 +0200
Subject: fs: s/__u32/u32/ for s_fsnotify_mask

The underscore variants are for uapi whereas the non-underscore variants
are for in-kernel consumers.

Link: https://lore.kernel.org/r/20240822-anwerben-nutzung-1cd6c82a565f@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8f8d274929d7..cda1f2545ea0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1266,7 +1266,7 @@ struct super_block {
 	time64_t		   s_time_min;
 	time64_t		   s_time_max;
 #ifdef CONFIG_FSNOTIFY
-	__u32			s_fsnotify_mask;
+	u32			s_fsnotify_mask;
 	struct fsnotify_sb_info	*s_fsnotify_info;
 #endif
 
-- 
cgit v1.2.3


From da18ecbf0fb6fb73e6f9273e7aa15610f148ce17 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:35 +0200
Subject: fs: add i_state helpers

The i_state member is an unsigned long so that it can be used with the
wait bit infrastructure which expects unsigned long. This wastes 4 bytes
which we're unlikely to ever use. Switch to using the var event wait
mechanism using the address of the bit. Thanks to Linus for the address
idea.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-1-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index cda1f2545ea0..d35de348d2ec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -744,6 +744,21 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;
 
+/*
+ * Get bit address from inode->i_state to use with wait_var_event()
+ * infrastructre.
+ */
+#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))
+
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+					    struct inode *inode, u32 bit);
+
+static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
+{
+	/* Caller is responsible for correct memory barriers. */
+	wake_up_var(inode_state_wait_address(inode, bit));
+}
+
 struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
 
 static inline unsigned int i_blocksize(const struct inode *node)
-- 
cgit v1.2.3


From 2ed634c96ed1d6e4ee0497be176f9980866e81cb Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:36 +0200
Subject: fs: reorder i_state bits

so that we can use the first bits to derive unique addresses from
i_state.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-2-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d35de348d2ec..a868c9823c10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2410,28 +2410,32 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *			i_count.
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
+ *
+ * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ * upon. There's one free address left.
  */
-#define I_DIRTY_SYNC		(1 << 0)
-#define I_DIRTY_DATASYNC	(1 << 1)
-#define I_DIRTY_PAGES		(1 << 2)
-#define __I_NEW			3
+#define __I_NEW			0
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		(1 << 4)
-#define I_FREEING		(1 << 5)
-#define I_CLEAR			(1 << 6)
-#define __I_SYNC		7
+#define __I_SYNC		1
 #define I_SYNC			(1 << __I_SYNC)
-#define I_REFERENCED		(1 << 8)
+#define __I_LRU_ISOLATING	2
+#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
+
+#define I_DIRTY_SYNC		(1 << 3)
+#define I_DIRTY_DATASYNC	(1 << 4)
+#define I_DIRTY_PAGES		(1 << 5)
+#define I_WILL_FREE		(1 << 6)
+#define I_FREEING		(1 << 7)
+#define I_CLEAR			(1 << 8)
+#define I_REFERENCED		(1 << 9)
 #define I_LINKABLE		(1 << 10)
 #define I_DIRTY_TIME		(1 << 11)
-#define I_WB_SWITCH		(1 << 13)
-#define I_OVL_INUSE		(1 << 14)
-#define I_CREATING		(1 << 15)
-#define I_DONTCACHE		(1 << 16)
-#define I_SYNC_QUEUED		(1 << 17)
-#define I_PINNING_NETFS_WB	(1 << 18)
-#define __I_LRU_ISOLATING	19
-#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
+#define I_WB_SWITCH		(1 << 12)
+#define I_OVL_INUSE		(1 << 13)
+#define I_CREATING		(1 << 14)
+#define I_DONTCACHE		(1 << 15)
+#define I_SYNC_QUEUED		(1 << 16)
+#define I_PINNING_NETFS_WB	(1 << 17)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
-- 
cgit v1.2.3


From 0fe340a98b584a31b07c664dc5ff0c923730581e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:38 +0200
Subject: inode: port __I_NEW to var event

Port the __I_NEW mechanism to use the new var event mechanism.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/writeback.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56b85841ae4c..8f651bb0a1a5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
+	wait_var_event(inode_state_wait_address(inode, __I_NEW),
+		       !(READ_ONCE(inode->i_state) & I_NEW));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
-- 
cgit v1.2.3


From 2b111edbe0a9c441605be5cfb73001dc98ec686f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:40 +0200
Subject: inode: make i_state a u32

Now that we use the wait var event mechanism make i_state a u32 and free
up 4 bytes. This means we currently have two 4 byte holes in struct
inode which we can pack.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-6-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index a868c9823c10..a1ef8c65d828 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -681,7 +681,8 @@ struct inode {
 #endif
 
 	/* Misc */
-	unsigned long		i_state;
+	u32			i_state;
+	/* 32-bit hole */
 	struct rw_semaphore	i_rwsem;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
-- 
cgit v1.2.3


From 2a9f8995f7139a759fe8cb31d9e8a433757228ec Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Date: Thu, 4 Jul 2024 19:23:18 +0200
Subject: mfd: 88pm80x: Constify read-only regmap structs

`pm800_irq`, `pm805_irq` and `pm805_irq_chip` are not modified and can
be declared as const to move their data to a read-only section.

In order to keep the const modifier for the regmap_irq_chip structures,
the pointer used to reference them must be converted to const as well.

Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Link: https://lore.kernel.org/r/20240704-mfd-const-regmap_config-v2-8-0c8785b1331d@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/88pm80x.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/88pm80x.h b/include/linux/mfd/88pm80x.h
index def5df6e74bf..551ef1c367d6 100644
--- a/include/linux/mfd/88pm80x.h
+++ b/include/linux/mfd/88pm80x.h
@@ -294,7 +294,7 @@ struct pm80x_chip {
 	struct i2c_client *client;
 	struct i2c_client *companion;
 	struct regmap *regmap;
-	struct regmap_irq_chip *regmap_irq_chip;
+	const struct regmap_irq_chip *regmap_irq_chip;
 	struct regmap_irq_chip_data *irq_data;
 	int type;
 	int irq;
-- 
cgit v1.2.3


From 87b9c9ea01f4fd9193f30cc19bd03b05541ff4cb Mon Sep 17 00:00:00 2001
From: Wilken Gottwalt <wilken.gottwalt@posteo.net>
Date: Mon, 15 Jul 2024 08:17:28 +0000
Subject: mfd: ds1wm: Remove remaining header file

The driver was removed after kernel 6.2 rendering the header file
unused.

Signed-off-by: Wilken Gottwalt <wilken.gottwalt@posteo.net>
Link: https://lore.kernel.org/r/ZpTbGHb6EX2Oe7ok@monster.localdomain
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/ds1wm.h | 29 -----------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 include/linux/mfd/ds1wm.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h
deleted file mode 100644
index 43dfca1c9702..000000000000
--- a/include/linux/mfd/ds1wm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* MFD cell driver data for the DS1WM driver
- *
- * to be defined in the MFD device that is
- * using this driver for one of his sub devices
- */
-
-struct ds1wm_driver_data {
-	int active_high;
-	int clock_rate;
-	/* in milliseconds, the amount of time to
-	 * sleep following a reset pulse. Zero
-	 * should work if your bus devices recover
-	 * time respects the 1-wire spec since the
-	 * ds1wm implements the precise timings of
-	 * a reset pulse/presence detect sequence.
-	 */
-	unsigned int reset_recover_delay;
-
-	/* Say 1 here for big endian Hardware
-	 * (only relevant with bus-shift > 0
-	 */
-	bool is_hw_big_endian;
-
-	/* left shift of register number to get register address offsett.
-	 * Only 0,1,2 allowed for 8,16 or 32 bit bus width respectively
-	 */
-	unsigned int bus_shift;
-};
-- 
cgit v1.2.3


From 5e9629d0ae977d6f6916d7e519724804e95f0b07 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 27 Aug 2024 15:51:12 +0100
Subject: drivers/perf: arm_spe: Use perf_allow_kernel() for permissions

Use perf_allow_kernel() for 'pa_enable' (physical addresses),
'pct_enable' (physical timestamps) and context IDs. This means that
perf_event_paranoid is now taken into account and LSM hooks can be used,
which is more consistent with other perf_event_open calls. For example
PERF_SAMPLE_PHYS_ADDR uses perf_allow_kernel() rather than just
perfmon_capable().

This also indirectly fixes the following error message which is
misleading because perf_event_paranoid is not taken into account by
perfmon_capable():

  $ perf record -e arm_spe/pa_enable/

  Error:
  Access to performance monitoring and observability operations is
  limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid
  setting ...

Suggested-by: Al Grant <al.grant@arm.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Link: https://lore.kernel.org/r/20240827145113.1224604-1-james.clark@linaro.org
Link: https://lore.kernel.org/all/20240807120039.GD37996@noisy.programming.kicks-ass.net/
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf_event.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a8942277dda..e336306b8c08 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1602,13 +1602,7 @@ static inline int perf_is_paranoid(void)
 	return sysctl_perf_event_paranoid > -1;
 }
 
-static inline int perf_allow_kernel(struct perf_event_attr *attr)
-{
-	if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
-		return -EACCES;
-
-	return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
-}
+int perf_allow_kernel(struct perf_event_attr *attr);
 
 static inline int perf_allow_cpu(struct perf_event_attr *attr)
 {
-- 
cgit v1.2.3


From 6c17c7d5936e6af6a5bda9f9de98a5e2ee6e8a6f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 7 Aug 2024 15:19:20 -0300
Subject: iommu: Allow ATS to work on VFs when the PF uses IDENTITY

PCI ATS has a global Smallest Translation Unit field that is located in
the PF but shared by all of the VFs.

The expectation is that the STU will be set to the root port's global STU
capability which is driven by the IO page table configuration of the iommu
HW. Today it becomes set when the iommu driver first enables ATS.

Thus, to enable ATS on the VF, the PF must have already had the correct
STU programmed, even if ATS is off on the PF.

Unfortunately the PF only programs the STU when the PF enables ATS. The
iommu drivers tend to leave ATS disabled when IDENTITY translation is
being used.

Thus we can get into a state where the PF is setup to use IDENTITY with
the DMA API while the VF would like to use VFIO with a PAGING domain and
have ATS turned on. This fails because the PF never loaded a PAGING domain
and so it never setup the STU, and the VF can't do it.

The simplest solution is to have the iommu driver set the ATS STU when it
probes the device. This way the ATS STU is loaded immediately at boot time
to all PFs and there is no issue when a VF comes to use it.

Add a new call pci_prepare_ats() which should be called by iommu drivers
in their probe_device() op for every PCI device if the iommu driver
supports ATS. This will setup the STU based on whatever page size
capability the iommu HW has.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/0-v1-0fb4d2ab6770+7e706-ats_vf_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/pci-ats.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index df54cd5b15db..0e8b74e63767 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -8,6 +8,7 @@
 /* Address Translation Service */
 bool pci_ats_supported(struct pci_dev *dev);
 int pci_enable_ats(struct pci_dev *dev, int ps);
+int pci_prepare_ats(struct pci_dev *dev, int ps);
 void pci_disable_ats(struct pci_dev *dev);
 int pci_ats_queue_depth(struct pci_dev *dev);
 int pci_ats_page_aligned(struct pci_dev *dev);
@@ -16,6 +17,8 @@ static inline bool pci_ats_supported(struct pci_dev *d)
 { return false; }
 static inline int pci_enable_ats(struct pci_dev *d, int ps)
 { return -ENODEV; }
+static inline int pci_prepare_ats(struct pci_dev *dev, int ps)
+{ return -ENODEV; }
 static inline void pci_disable_ats(struct pci_dev *d) { }
 static inline int pci_ats_queue_depth(struct pci_dev *d)
 { return -ENODEV; }
-- 
cgit v1.2.3


From 5c40e050e6ac0218af7c520095729d440cc87e6b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 29 Aug 2024 15:06:40 +0200
Subject: fs: drop GFP_NOFAIL mode from alloc_page_buffers

There is only one called of alloc_page_buffers and it doesn't require
__GFP_NOFAIL so drop this allocation mode.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Link: https://lore.kernel.org/r/20240829130640.1397970-1-mhocko@kernel.org
Acked-by: Song Liu <song@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/buffer_head.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 14acf1bbe0ce..7e903457967a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -199,8 +199,7 @@ void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 					gfp_t gfp);
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-		bool retry);
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size);
 struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
-- 
cgit v1.2.3


From a06c3fad49a50d5d5eb078f93e70f4d3eca5d5a5 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 30 Aug 2024 14:01:45 +0100
Subject: drivers/virt: pkvm: Add initial support for running as a protected
 guest

Implement a pKVM protected guest driver to probe the presence of pKVM
and determine the memory protection granule using the HYP_MEMINFO
hypercall.

Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240830130150.8568-3-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 083f85653716..16b6dcc54e02 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -115,6 +115,7 @@
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
 #define ARM_SMCCC_KVM_FUNC_PTP			1
+#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO		2
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -137,6 +138,12 @@
 			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
 			   ARM_SMCCC_KVM_FUNC_PTP)
 
+#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID			\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_64,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_HYP_MEMINFO)
+
 /* ptp_kvm counter type ID */
 #define KVM_PTP_VIRT_COUNTER			0
 #define KVM_PTP_PHYS_COUNTER			1
-- 
cgit v1.2.3


From ebc59b120c588156feb7ce194a9636584ced18ba Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 30 Aug 2024 14:01:47 +0100
Subject: drivers/virt: pkvm: Hook up mem_encrypt API using pKVM hypercalls

If we detect the presence of pKVM's SHARE and UNSHARE hypercalls, then
register a backend implementation of the mem_encrypt API so that things
like DMA buffers can be shared appropriately with the host.

Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240830130150.8568-5-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 16b6dcc54e02..9cb7c95920b0 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -116,6 +116,8 @@
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
 #define ARM_SMCCC_KVM_FUNC_PTP			1
 #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO		2
+#define ARM_SMCCC_KVM_FUNC_MEM_SHARE		3
+#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE		4
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -144,6 +146,18 @@
 			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
 			   ARM_SMCCC_KVM_FUNC_HYP_MEMINFO)
 
+#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID			\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_64,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_MEM_SHARE)
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID			\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_64,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_MEM_UNSHARE)
+
 /* ptp_kvm counter type ID */
 #define KVM_PTP_VIRT_COUNTER			0
 #define KVM_PTP_PHYS_COUNTER			1
-- 
cgit v1.2.3


From 0f12694958001c96bda811473fdb23f333c6d3ca Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 30 Aug 2024 14:01:49 +0100
Subject: drivers/virt: pkvm: Intercept ioremap using pKVM MMIO_GUARD hypercall

Hook up pKVM's MMIO_GUARD hypercall so that ioremap() and friends will
register the target physical address as MMIO with the hypervisor,
allowing guest exits to that page to be emulated by the host with full
syndrome information.

Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240830130150.8568-7-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 9cb7c95920b0..e93c1f7cea70 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -118,6 +118,7 @@
 #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO		2
 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE		3
 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE		4
+#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD		7
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -158,6 +159,12 @@
 			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
 			   ARM_SMCCC_KVM_FUNC_MEM_UNSHARE)
 
+#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_FUNC_ID			\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_64,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_MMIO_GUARD)
+
 /* ptp_kvm counter type ID */
 #define KVM_PTP_VIRT_COUNTER			0
 #define KVM_PTP_PHYS_COUNTER			1
-- 
cgit v1.2.3


From 21be9f7110d4c044c2b49bafbd7246335f236221 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 30 Aug 2024 14:01:50 +0100
Subject: arm64: smccc: Reserve block of KVM "vendor" services for pKVM
 hypercalls

pKVM relies on hypercalls to expose services such as memory sharing to
protected guests. Tentatively allocate a block of 58 hypercalls (i.e.
fill the remaining space in the first 64 function IDs) for pKVM usage,
as future extensions such as pvIOMMU support, range-based memory sharing
and validation of assigned devices will require additional services.

Suggested-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/86a5h5yg5y.wl-maz@kernel.org
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240830130150.8568-8-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index e93c1f7cea70..f59099a213d0 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -115,10 +115,70 @@
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
 #define ARM_SMCCC_KVM_FUNC_PTP			1
+/* Start of pKVM hypercall range */
 #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO		2
 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE		3
 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE		4
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_5		5
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_6		6
 #define ARM_SMCCC_KVM_FUNC_MMIO_GUARD		7
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_8		8
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_9		9
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_10		10
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_11		11
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_12		12
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_13		13
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_14		14
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_15		15
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_16		16
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_17		17
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_18		18
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_19		19
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_20		20
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_21		21
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_22		22
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_23		23
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_24		24
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_25		25
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_26		26
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_27		27
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_28		28
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_29		29
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_30		30
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_31		31
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_32		32
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_33		33
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_34		34
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_35		35
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_36		36
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_37		37
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_38		38
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_39		39
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_40		40
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_41		41
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_42		42
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_43		43
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_44		44
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_45		45
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_46		46
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_47		47
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_48		48
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_49		49
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_50		50
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_51		51
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_52		52
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_53		53
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_54		54
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_55		55
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_56		56
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_57		57
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_58		58
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_59		59
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_60		60
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_61		61
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62		62
+#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_63		63
+/* End of pKVM hypercall range */
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
-- 
cgit v1.2.3


From 929c81ade6355b87097a2a4886c10750e68626cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net>
Date: Fri, 30 Aug 2024 11:45:57 +0200
Subject: fbdev: Introduce devm_register_framebuffer()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a device-managed variant of register_framebuffer() which
automatically unregisters the framebuffer on device destruction.
This can simplify the error handling and resource management in drivers.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/fb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index db7d97b10964..abf6643ebcaf 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -601,6 +601,7 @@ extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf,
 /* fbmem.c */
 extern int register_framebuffer(struct fb_info *fb_info);
 extern void unregister_framebuffer(struct fb_info *fb_info);
+extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info);
 extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size);
 extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx,
 				u32 height, u32 shift_high, u32 shift_low, u32 mod);
-- 
cgit v1.2.3


From 9aee8262445d185960431e972e2d997e6aba3de0 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Mon, 26 Aug 2024 11:58:23 +0800
Subject: ARM: OMAP2+: Remove obsoleted declaration for gpmc_onenand_init

The gpmc_onenand_init() have been removed since
commit 2514830b8b8c ("ARM: OMAP2+: Remove gpmc-onenand"), and now
it is useless, so remove it.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Link: https://lore.kernel.org/r/20240826035823.4043171-1-cuigaosheng1@huawei.com
Signed-off-by: Kevin Hilman <khilman@baylibre.com>
---
 include/linux/omap-gpmc.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h
index 082841908fe7..c9e3843d2dd5 100644
--- a/include/linux/omap-gpmc.h
+++ b/include/linux/omap-gpmc.h
@@ -84,13 +84,3 @@ extern void gpmc_read_settings_dt(struct device_node *np,
 struct gpmc_timings;
 struct omap_nand_platform_data;
 struct omap_onenand_platform_data;
-
-#if IS_ENABLED(CONFIG_MTD_ONENAND_OMAP2)
-extern int gpmc_onenand_init(struct omap_onenand_platform_data *d);
-#else
-#define board_onenand_data	NULL
-static inline int gpmc_onenand_init(struct omap_onenand_platform_data *d)
-{
-	return 0;
-}
-#endif
-- 
cgit v1.2.3


From ddc94d0b17e8ea8179ecbbefacac3fba0fb77265 Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Mon, 26 Aug 2024 10:01:43 -0700
Subject: dma-buf: Split out dma fence array create into alloc and arm
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Useful to preallocate dma fence array and then arm in path of reclaim or
a dma fence.

v2:
 - s/arm/init (Christian)
 - Drop !array warn (Christian)
v3:
 - Fix kernel doc typos (dim)

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Christian König <christian.koenig@amd.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240826170144.2492062-2-matthew.brost@intel.com
---
 include/linux/dma-fence-array.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h
index 29c5650c1038..079b3dec0a16 100644
--- a/include/linux/dma-fence-array.h
+++ b/include/linux/dma-fence-array.h
@@ -79,6 +79,12 @@ to_dma_fence_array(struct dma_fence *fence)
 	for (index = 0, fence = dma_fence_array_first(head); fence;	\
 	     ++(index), fence = dma_fence_array_next(head, index))
 
+struct dma_fence_array *dma_fence_array_alloc(int num_fences);
+void dma_fence_array_init(struct dma_fence_array *array,
+			  int num_fences, struct dma_fence **fences,
+			  u64 context, unsigned seqno,
+			  bool signal_on_any);
+
 struct dma_fence_array *dma_fence_array_create(int num_fences,
 					       struct dma_fence **fences,
 					       u64 context, unsigned seqno,
-- 
cgit v1.2.3


From 1abab1ba0775036bb67c6c57945c637be644c04f Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong@huawei.com>
Date: Fri, 30 Aug 2024 10:02:28 +0000
Subject: cgroup/cpuset: guard cpuset-v1 code under CONFIG_CPUSETS_V1

This patch introduces CONFIG_CPUSETS_V1 and guard cpuset-v1 code under
CONFIG_CPUSETS_V1. The default value of CONFIG_CPUSETS_V1 is N, so that
user who adopted v2 don't have 'pay' for cpuset v1.

Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2a6981eeebf8..835e7b793f6a 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 					  const struct task_struct *tsk2);
 
+#ifdef CONFIG_CPUSETS_V1
 #define cpuset_memory_pressure_bump() 				\
 	do {							\
 		if (cpuset_memory_pressure_enabled)		\
@@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 	} while (0)
 extern int cpuset_memory_pressure_enabled;
 extern void __cpuset_memory_pressure_bump(void);
+#else
+static inline void cpuset_memory_pressure_bump(void) { }
+#endif
 
 extern void cpuset_task_status_allowed(struct seq_file *m,
 					struct task_struct *task);
-- 
cgit v1.2.3


From 0328947c50324cf4b2d8b181bf948edb8101f59f Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 28 Aug 2024 21:16:15 +0530
Subject: PCI: endpoint: Assign PCI domain number for endpoint controllers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now, PCI endpoint subsystem doesn't assign PCI domain number for the
PCI endpoint controllers. But this domain number could be useful to the EPC
drivers to uniquely identify each controller based on the hardware instance
when there are multiple ones present in an SoC (even multiple RC/EP).

So let's make use of the existing pci_bus_find_domain_nr() API to allocate
domain numbers based on either devicetree (linux,pci-domain) property or
dynamic domain number allocation scheme.

It should be noted that the domain number allocated by this API will be
based on both RC and EP controllers in a SoC. If the 'linux,pci-domain' DT
property is present, then the domain number represents the actual hardware
instance of the PCI endpoint controller. If not, then the domain number
will be allocated based on the PCI EP/RC controller probe order.

If the architecture doesn't support CONFIG_PCI_DOMAINS_GENERIC (rare), then
currently a warning is thrown to indicate that the architecture specific
implementation is needed.

Link: https://lore.kernel.org/linux-pci/20240828-pci-qcom-hotplug-v4-5-263a385fbbcb@linaro.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
---
 include/linux/pci-epc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 85bdf2adb760..8e3dcac55dcd 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -128,6 +128,7 @@ struct pci_epc_mem {
  * @group: configfs group representing the PCI EPC device
  * @lock: mutex to protect pci_epc ops
  * @function_num_map: bitmap to manage physical function number
+ * @domain_nr: PCI domain number of the endpoint controller
  * @init_complete: flag to indicate whether the EPC initialization is complete
  *                 or not
  */
@@ -145,6 +146,7 @@ struct pci_epc {
 	/* mutex to protect against concurrent access of EP controller */
 	struct mutex			lock;
 	unsigned long			function_num_map;
+	int				domain_nr;
 	bool				init_complete;
 };
 
-- 
cgit v1.2.3


From 4ed9ef32606386efc562bada891d7baa16fc46b4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 15 Jul 2024 17:14:14 +1000
Subject: lockd: discard nlmsvc_timeout

nlmsvc_timeout always has the same value as (nlm_timeout * HZ), so use
that in the one place that nlmsvc_timeout is used.

In truth it *might* not always be the same as nlmsvc_timeout is only set
when lockd is started while nlm_timeout can be set at anytime via
sysctl.  I think this difference it not helpful so removing it is good.

Also remove the test for nlm_timout being 0.  This is not possible -
unless a module parameter is used to set the minimum timeout to 0, and
if that happens then it probably should be honoured.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/lockd/lockd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 1b95fe31051f..61c4b9c41904 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -200,7 +200,7 @@ extern const struct svc_procedure nlmsvc_procedures[24];
 extern const struct svc_procedure nlmsvc_procedures4[24];
 #endif
 extern int			nlmsvc_grace_period;
-extern unsigned long		nlmsvc_timeout;
+extern unsigned long		nlm_timeout;
 extern bool			nsm_use_hostnames;
 extern u32			nsm_local_state;
 
-- 
cgit v1.2.3


From f2b27e1d72527f94f030b6356b3187576e60885b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 15 Jul 2024 17:14:15 +1000
Subject: SUNRPC: make various functions static, or not exported.

Various functions are only used within the sunrpc module, and several
are only use in the one file.  So clean up:

These are marked static, and any EXPORT is removed.
  svc_rcpb_setup()
  svc_rqst_alloc()
  svc_rqst_free()  - also moved before first use
  svc_rpcbind_set_version()
  svc_drop() - also moved to svc.c

These are now not EXPORTed, but are not static.
  svc_authenticate()
  svc_sock_update_bufs()

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h     | 9 ---------
 include/linux/sunrpc/svcauth.h | 1 -
 include/linux/sunrpc/svcsock.h | 2 --
 3 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index a7d0406b9ef5..e4fa25fafa97 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -401,17 +401,13 @@ struct svc_procedure {
  */
 int sunrpc_set_pool_mode(const char *val);
 int sunrpc_get_pool_mode(char *val, size_t size);
-int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
 void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
 int svc_bind(struct svc_serv *serv, struct net *net);
 struct svc_serv *svc_create(struct svc_program *, unsigned int,
 			    int (*threadfn)(void *data));
-struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
-					struct svc_pool *pool, int node);
 bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
 void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
-void		   svc_rqst_free(struct svc_rqst *);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *prog,
 				     struct svc_stat *stats,
@@ -446,11 +442,6 @@ int		   svc_generic_rpcbind_set(struct net *net,
 					   u32 version, int family,
 					   unsigned short proto,
 					   unsigned short port);
-int		   svc_rpcbind_set_version(struct net *net,
-					   const struct svc_program *progp,
-					   u32 version, int family,
-					   unsigned short proto,
-					   unsigned short port);
 
 #define	RPC_MAX_ADDRBUFLEN	(63U)
 
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 61c455f1e1f5..63cf6fb26dcc 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -151,7 +151,6 @@ struct auth_ops {
 
 struct svc_xprt;
 
-extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp);
 extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp);
 extern int	svc_authorise(struct svc_rqst *rqstp);
 extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp);
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 7c78ec6356b9..bf45d9e8492a 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -58,8 +58,6 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
  */
 void		svc_recv(struct svc_rqst *rqstp);
 void		svc_send(struct svc_rqst *rqstp);
-void		svc_drop(struct svc_rqst *);
-void		svc_sock_update_bufs(struct svc_serv *serv);
 int		svc_addsock(struct svc_serv *serv, struct net *net,
 			    const int fd, char *name_return, const size_t len,
 			    const struct cred *cred);
-- 
cgit v1.2.3


From 5fe690a5946442f7f2d08448341dd621367f80bc Mon Sep 17 00:00:00 2001
From: Matthew Cassell <mcassell411@gmail.com>
Date: Mon, 22 Jul 2024 17:13:16 +0000
Subject: mm: add node_reclaim successes to VM event counters

/proc/vmstat currently shows the number of node_reclaim() failures when
vm.zone_reclaim_mode is set appropriately.  It would be convenient to have
the number of successes right next to zone_reclaim_failed (similar to
compaction and migration).

While just a trivially addition to the vmstat file.  It was helpful during
benchmarking to not have to probe node_reclaim() to observe the
success/failure ratio.

Link: https://lkml.kernel.org/r/20240722171316.7517-1-mcassell411@gmail.com
Signed-off-by: Matthew Cassell <mcassell411@gmail.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..6fff8ed3b1fd 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -50,6 +50,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGSTEAL_ANON,
 		PGSTEAL_FILE,
 #ifdef CONFIG_NUMA
+		PGSCAN_ZONE_RECLAIM_SUCCESS,
 		PGSCAN_ZONE_RECLAIM_FAILED,
 #endif
 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
-- 
cgit v1.2.3


From 3ddc2fefe6f34ec870fcb22f4cd656e53db079f2 Mon Sep 17 00:00:00 2001
From: Danilo Krummrich <dakr@kernel.org>
Date: Mon, 22 Jul 2024 18:29:23 +0200
Subject: mm: vmalloc: implement vrealloc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Align kvrealloc() with krealloc()", v2.

Besides the obvious (and desired) difference between krealloc() and
kvrealloc(), there is some inconsistency in their function signatures and
behavior:

 - krealloc() frees the memory when the requested size is zero, whereas
   kvrealloc() simply returns a pointer to the existing allocation.

 - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas
   kvrealloc() does not accept a NULL pointer at all and, if passed, would fault
   instead.

 - krealloc() is self-contained, whereas kvrealloc() relies on the caller to
   provide the size of the previous allocation.

Inconsistent behavior throughout allocation APIs is error prone, hence
make kvrealloc() behave like krealloc(), which seems superior in all
mentioned aspects.

In order to be able to get rid of kvrealloc()'s oldsize parameter,
introduce vrealloc() and make use of it in kvrealloc().

Making use of vrealloc() in kvrealloc() also provides oppertunities to
grow (and shrink) allocations more efficiently.  For instance, vrealloc()
can be optimized to allocate and map additional pages to grow the
allocation or unmap and free unused pages to shrink the allocation.

Besides the above, those functions are required by Rust's allocator abstractons
[1] (rework based on this series in [2]). With `Vec` or `KVec` respectively,
potentially growing (and shrinking) data structures are rather common.

[1] https://lore.kernel.org/lkml/20240704170738.3621-1-dakr@redhat.com/
[2] https://git.kernel.org/pub/scm/linux/kernel/git/dakr/linux.git/log/?h=rust/mm


This patch (of 2):

Implement vrealloc() analogous to krealloc().

Currently, krealloc() requires the caller to pass the size of the previous
memory allocation, which, instead, should be self-contained.

We attempt to fix this in a subsequent patch which, in order to do so,
requires vrealloc().

Besides that, we need realloc() functions for kernel allocators in Rust
too.  With `Vec` or `KVec` respectively, potentially growing (and
shrinking) data structures are rather common.

[dakr@kernel.org: fix missing nommu implementation]
  Link: https://lkml.kernel.org/r/20240725141227.13954-1-dakr@kernel.org
[dakr@kernel.org: document concurrency restrictions]
  Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org
[dakr@kernel.org: consider spare memory for __GFP_ZERO]
  Link: https://lkml.kernel.org/r/20240730185049.6244-3-dakr@kernel.org
[dakr@kernel.org: properly document __GFP_ZERO behavior]
  Link: https://lkml.kernel.org/r/20240730185049.6244-4-dakr@kernel.org
Link: https://lkml.kernel.org/r/20240722163111.4766-1-dakr@kernel.org
Link: https://lkml.kernel.org/r/20240722163111.4766-2-dakr@kernel.org
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmalloc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index e4a631ec430b..ad2ce7a6ab7a 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -189,6 +189,10 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1
 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
 #define vcalloc(...)		alloc_hooks(vcalloc_noprof(__VA_ARGS__))
 
+void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+		__realloc_size(2);
+#define vrealloc(...)		alloc_hooks(vrealloc_noprof(__VA_ARGS__))
+
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
 
-- 
cgit v1.2.3


From 590b9d576caec6b4c46bba49ed36223a399c3fc5 Mon Sep 17 00:00:00 2001
From: Danilo Krummrich <dakr@kernel.org>
Date: Mon, 22 Jul 2024 18:29:24 +0200
Subject: mm: kvmalloc: align kvrealloc() with krealloc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Besides the obvious (and desired) difference between krealloc() and
kvrealloc(), there is some inconsistency in their function signatures and
behavior:

 - krealloc() frees the memory when the requested size is zero, whereas
   kvrealloc() simply returns a pointer to the existing allocation.

 - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas
   kvrealloc() does not accept a NULL pointer at all and, if passed,
   would fault instead.

 - krealloc() is self-contained, whereas kvrealloc() relies on the caller
   to provide the size of the previous allocation.

Inconsistent behavior throughout allocation APIs is error prone, hence
make kvrealloc() behave like krealloc(), which seems superior in all
mentioned aspects.

Besides that, implementing kvrealloc() by making use of krealloc() and
vrealloc() provides oppertunities to grow (and shrink) allocations more
efficiently.  For instance, vrealloc() can be optimized to allocate and
map additional pages to grow the allocation or unmap and free unused pages
to shrink the allocation.

[dakr@kernel.org: document concurrency restrictions]
  Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org
[dakr@kernel.org: disable KASAN when switching to vmalloc]
  Link: https://lkml.kernel.org/r/20240730185049.6244-2-dakr@kernel.org
[dakr@kernel.org: properly document __GFP_ZERO behavior]
  Link: https://lkml.kernel.org/r/20240730185049.6244-5-dakr@kernel.org
Link: https://lkml.kernel.org/r/20240722163111.4766-3-dakr@kernel.org
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index eb2bf4629157..c9cb42203183 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -841,8 +841,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
 #define kvcalloc_node(...)			alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
 #define kvcalloc(...)				alloc_hooks(kvcalloc_noprof(__VA_ARGS__))
 
-extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
-		      __realloc_size(3);
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+		__realloc_size(2);
 #define kvrealloc(...)				alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
 
 extern void kvfree(const void *addr);
-- 
cgit v1.2.3


From d58a2a581f132529eefac5377676011562b631b8 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 22 Jul 2024 13:43:18 +0800
Subject: mm: shmem: rename shmem_is_huge() to shmem_huge_global_enabled()

shmem_is_huge() is now used to check if the top-level huge page is
enabled, thus rename it to reflect its usage.

Link: https://lkml.kernel.org/r/da53296e0ab6359aa083561d9dc01e4223d60fbe.1721626645.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1d06b1e5408a..405ee8d3589a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,14 +111,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-			  struct mm_struct *mm, unsigned long vm_flags);
+extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+				      struct mm_struct *mm, unsigned long vm_flags);
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
 				bool global_huge);
 #else
-static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-					  struct mm_struct *mm, unsigned long vm_flags)
+static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+						      bool shmem_huge_force, struct mm_struct *mm,
+						      unsigned long vm_flags)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 6beeab870e70b2d4f49baf6c6be9da1b61c169f8 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 22 Jul 2024 13:43:19 +0800
Subject: mm: shmem: move shmem_huge_global_enabled() into
 shmem_allowable_huge_orders()

Move shmem_huge_global_enabled() into shmem_allowable_huge_orders(), so
that shmem_allowable_huge_orders() can also help to find the allowable
huge orders for tmpfs.  Moreover the shmem_huge_global_enabled() can
become static.  While we are at it, passing the vma instead of mm for
shmem_huge_global_enabled() makes code cleaner.

No functional changes.

Link: https://lkml.kernel.org/r/8e825146bb29ee1a1c7bd64d2968ff3e19be7815.1721626645.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 405ee8d3589a..1564d7d3ca61 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,21 +111,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-				      struct mm_struct *mm, unsigned long vm_flags);
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool global_huge);
+				bool shmem_huge_force);
 #else
-static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
-						      bool shmem_huge_force, struct mm_struct *mm,
-						      unsigned long vm_flags)
-{
-	return false;
-}
 static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool global_huge)
+				bool shmem_huge_force)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 4fd568faf6e7e21f206cd66dd4fca9a72e7d922c Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 18 Jul 2024 17:18:21 +0800
Subject: mm: kmem: remove mem_cgroup_from_obj()

There is no user of mem_cgroup_from_obj(), remove it.

Link: https://lkml.kernel.org/r/20240718091821.44740-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0e5bf25d324f..af7da7bd00af 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1717,7 +1717,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
-struct mem_cgroup *mem_cgroup_from_obj(void *p);
 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
 
 static inline void count_objcg_event(struct obj_cgroup *objcg,
@@ -1780,11 +1779,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 	return -1;
 }
 
-static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
-{
-	return NULL;
-}
-
 static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
 {
 	return NULL;
-- 
cgit v1.2.3


From 2a28713a67fd28eedc13b32300df6b9c5a2381f5 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 24 Jul 2024 09:01:14 -0400
Subject: memory tiering: introduce folio_use_access_time() check

If memory tiering mode is on and a folio is not in the top tier memory,
folio's cpupid field is repurposed to store page access time.  Instead of
an open coded check, use a function to encapsulate the check.

Link: https://lkml.kernel.org/r/20240724130115.793641-3-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6549d0979b28..6c2078ca3391 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1745,6 +1745,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
+
+bool folio_use_access_time(struct folio *folio);
 #else /* !CONFIG_NUMA_BALANCING */
 static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
@@ -1798,6 +1800,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 {
 }
+static inline bool folio_use_access_time(struct folio *folio)
+{
+	return false;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-- 
cgit v1.2.3


From c4a6fce85640beb4f79e8217272d1ef79839cc17 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Wed, 24 Jul 2024 20:33:21 +0000
Subject: vmstat: kernel stack usage histogram

As part of the dynamic kernel stack project, we need to know the amount of
data that can be saved by reducing the default kernel stack size [1].

Provide a kernel stack usage histogram to aid in optimizing kernel stack
sizes and minimizing memory waste in large-scale environments.  The
histogram divides stack usage into power-of-two buckets and reports the
results in /proc/vmstat.  This information is especially valuable in
environments with millions of machines, where even small optimizations can
have a significant impact.

The histogram data is presented in /proc/vmstat with entries like
"kstack_1k", "kstack_2k", and so on, indicating the number of threads that
exited with stack usage falling within each respective bucket.

Example outputs:
Intel:
$ grep kstack /proc/vmstat
kstack_1k 3
kstack_2k 188
kstack_4k 11391
kstack_8k 243
kstack_16k 0

ARM with 64K page_size:
$ grep kstack /proc/vmstat
kstack_1k 1
kstack_2k 340
kstack_4k 25212
kstack_8k 1659
kstack_16k 0
kstack_32k 0
kstack_64k 0

Note: once the dynamic kernel stack is implemented it will depend on the
implementation the usability of this feature: On hardware that supports
faults on kernel stacks, we will have other metrics that show the total
number of pages allocated for stacks.  On hardware where faults are not
supported, we will most likely have some optimization where only some
threads are extended, and for those, these metrics will still be very
useful.

[1] https://lwn.net/Articles/974367

Link: https://lkml.kernel.org/r/20240730150158.832783-3-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20240724203322.2765486-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 6fff8ed3b1fd..aae5c7c5cfb4 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -155,6 +155,30 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		VMA_LOCK_RETRY,
 		VMA_LOCK_MISS,
 #endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+		KSTACK_1K,
+#if THREAD_SIZE > 1024
+		KSTACK_2K,
+#endif
+#if THREAD_SIZE > 2048
+		KSTACK_4K,
+#endif
+#if THREAD_SIZE > 4096
+		KSTACK_8K,
+#endif
+#if THREAD_SIZE > 8192
+		KSTACK_16K,
+#endif
+#if THREAD_SIZE > 16384
+		KSTACK_32K,
+#endif
+#if THREAD_SIZE > 32768
+		KSTACK_64K,
+#endif
+#if THREAD_SIZE > 65536
+		KSTACK_REST,
+#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
 		NR_VM_EVENT_ITEMS
 };
 
-- 
cgit v1.2.3


From fbe76a6557a83af5ef3819fd7b7ffd0a5d3b4e51 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Wed, 24 Jul 2024 20:33:22 +0000
Subject: task_stack: uninline stack_not_used

Given that stack_not_used() is not performance critical function
uninline it.

Link: https://lkml.kernel.org/r/20240730150158.832783-4-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20240724203322.2765486-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Li Zhijian <lizhijian@fujitsu.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/task_stack.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd72b978e1f..bf10bdb487dd 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -95,23 +95,11 @@ static inline int object_is_on_stack(const void *obj)
 extern void thread_stack_cache_init(void);
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
+unsigned long stack_not_used(struct task_struct *p);
+#else
 static inline unsigned long stack_not_used(struct task_struct *p)
 {
-	unsigned long *n = end_of_stack(p);
-
-	do { 	/* Skip over canary */
-# ifdef CONFIG_STACK_GROWSUP
-		n--;
-# else
-		n++;
-# endif
-	} while (!*n);
-
-# ifdef CONFIG_STACK_GROWSUP
-	return (unsigned long)end_of_stack(p) - (unsigned long)n;
-# else
-	return (unsigned long)n - (unsigned long)end_of_stack(p);
-# endif
+	return 0;
 }
 #endif
 extern void set_task_stack_end_magic(struct task_struct *tsk);
-- 
cgit v1.2.3


From f77bd4b14ccfd38dfcfe67eecad517b8ec1b7f37 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Fri, 26 Jul 2024 20:31:08 +0000
Subject: mm: memcg: don't call propagate_protected_usage() needlessly

Patch series "mm: memcg: page counters optimizations", v3.

This patchset contains 3 independent small optimizations of page counters.


This patch (of 3):

Memory protection (min/low) requires a constant tracking of protected
memory usage.  propagate_protected_usage() is called on each page counters
update and does a number of operations even in cases when the actual
memory protection functionality is not supported (e.g.  hugetlb cgroups or
memcg swap counters).

It's obviously inefficient and leads to a waste of CPU cycles.  It can be
addressed by calling propagate_protected_usage() only for the counters
which do support memory guarantees.  As of now it's only memcg->memory -
the unified memory memcg counter.

Link: https://lkml.kernel.org/r/20240726203110.1577216-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_counter.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 904c52f97284..0b8e993f9163 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -31,6 +31,7 @@ struct page_counter {
 	/* Keep all the read most fields in a separete cacheline. */
 	CACHELINE_PADDING(_pad2_);
 
+	bool protection_support;
 	unsigned long min;
 	unsigned long low;
 	unsigned long high;
@@ -44,12 +45,17 @@ struct page_counter {
 #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE)
 #endif
 
+/*
+ * Protection is supported only for the first counter (with id 0).
+ */
 static inline void page_counter_init(struct page_counter *counter,
-				     struct page_counter *parent)
+				     struct page_counter *parent,
+				     bool protection_support)
 {
 	atomic_long_set(&counter->usage, 0);
 	counter->max = PAGE_COUNTER_MAX;
 	counter->parent = parent;
+	counter->protection_support = protection_support;
 }
 
 static inline unsigned long page_counter_read(struct page_counter *counter)
-- 
cgit v1.2.3


From 941ce635234162c420c9185816c59f7d5dd1ad07 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Fri, 26 Jul 2024 20:31:09 +0000
Subject: mm: page_counters: put page_counter_calculate_protection() under
 CONFIG_MEMCG

Put page_counter_calculate_protection() under CONFIG_MEMCG.

The protection functionality (min/low limits) is not supported by any
other cgroup subsystem, so page_counter_calculate_protection() and related
static effective_protection() can be compiled out if CONFIG_MEMCG is not
enabled.

Link: https://lkml.kernel.org/r/20240726203110.1577216-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_counter.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 0b8e993f9163..1b3e92c09b80 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -87,8 +87,14 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
 	counter->watermark = page_counter_read(counter);
 }
 
+#ifdef CONFIG_MEMCG
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
 				       bool recursive_protection);
+#else
+static inline void page_counter_calculate_protection(struct page_counter *root,
+						     struct page_counter *counter,
+						     bool recursive_protection) {}
+#endif
 
 #endif /* _LINUX_PAGE_COUNTER_H */
-- 
cgit v1.2.3


From 57979fabff554bf4d1edeeee69251b22ca9bf55e Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Fri, 26 Jul 2024 20:31:10 +0000
Subject: mm: page_counters: initialize usage using ATOMIC_LONG_INIT() macro

When a page_counter structure is initialized, there is no need to use an
atomic set operation to initialize the usage counter because at this point
the structure is not visible to anybody else.  ATOMIC_LONG_INIT() is what
should be used in such cases.

Link: https://lkml.kernel.org/r/20240726203110.1577216-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_counter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 1b3e92c09b80..66ebf9a73158 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -52,7 +52,7 @@ static inline void page_counter_init(struct page_counter *counter,
 				     struct page_counter *parent,
 				     bool protection_support)
 {
-	atomic_long_set(&counter->usage, 0);
+	counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0);
 	counter->max = PAGE_COUNTER_MAX;
 	counter->parent = parent;
 	counter->protection_support = protection_support;
-- 
cgit v1.2.3


From 394290cba9664ed3ab80d0e247402102a9b7287a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 26 Jul 2024 17:07:26 +0200
Subject: mm: turn USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PTE_PTLOCKS into Kconfig
 options

Patch series "mm: split PTE/PMD PT table Kconfig cleanups+clarifications".

This series is a follow up to the fixes:
	"[PATCH v1 0/2] mm/hugetlb: fix hugetlb vs. core-mm PT locking"

When working on the fixes, I wondered why 8xx is fine (-> never uses split
PT locks) and how PT locking even works properly with PMD page table
sharing (-> always requires split PMD PT locks).

Let's improve the split PT lock detection, make hugetlb properly depend on
it and make 8xx bail out if it would ever get enabled by accident.

As an alternative to patch #3 we could extend the Kconfig
SPLIT_PTE_PTLOCKS option from patch #2 -- but enforcing it closer to the
code that actually implements it feels a bit nicer for documentation
purposes, and there is no need to actually disable it because it should
always be disabled (!SMP).

Did a bunch of cross-compilations to make sure that split PTE/PMD PT locks
are still getting used where we would expect them.

[1] https://lkml.kernel.org/r/20240725183955.2268884-1-david@redhat.com


This patch (of 3):

Let's clean that up a bit and prepare for depending on
CONFIG_SPLIT_PMD_PTLOCKS in other Kconfig options.

More cleanups would be reasonable (like the arch-specific "depends on" for
CONFIG_SPLIT_PTE_PTLOCKS), but we'll leave that for another day.

Link: https://lkml.kernel.org/r/20240726150728.3159964-1-david@redhat.com
Link: https://lkml.kernel.org/r/20240726150728.3159964-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h            | 8 ++++----
 include/linux/mm_types.h      | 2 +-
 include/linux/mm_types_task.h | 3 ---
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6c2078ca3391..b7000fa300f3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2891,7 +2891,7 @@ static inline void pagetable_free(struct ptdesc *pt)
 	__free_pages(page, compound_order(page));
 }
 
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
 #if ALLOC_SPLIT_PTLOCKS
 void __init ptlock_cache_init(void);
 bool ptlock_alloc(struct ptdesc *ptdesc);
@@ -2949,7 +2949,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
 	return true;
 }
 
-#else	/* !USE_SPLIT_PTE_PTLOCKS */
+#else	/* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
  */
@@ -2964,7 +2964,7 @@ static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
 static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */
 
 static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
 {
@@ -3024,7 +3024,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
 	((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
 		NULL: pte_offset_kernel(pmd, address))
 
-#if USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_SPLIT_PMD_PTLOCKS)
 
 static inline struct page *pmd_pgtable_page(pmd_t *pmd)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..165c58b12ccc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -947,7 +947,7 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
 		struct mmu_notifier_subscriptions *notifier_subscriptions;
 #endif
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
 		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index a2f6179b672b..bff5706b76e1 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -16,9 +16,6 @@
 #include <asm/tlbbatch.h>
 #endif
 
-#define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
-#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
-		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
 #define ALLOC_SPLIT_PTLOCKS	(SPINLOCK_SIZE > BITS_PER_LONG/8)
 
 /*
-- 
cgit v1.2.3


From 188cac58a8bcdf82c7f63275b68f7a46871e45d6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 26 Jul 2024 17:07:27 +0200
Subject: mm/hugetlb: enforce that PMD PT sharing has split PMD PT locks

Sharing page tables between processes but falling back to per-MM page
table locks cannot possibly work.

So, let's make sure that we do have split PMD locks by adding a new
Kconfig option and letting that depend on CONFIG_SPLIT_PMD_PTLOCKS.

Link: https://lkml.kernel.org/r/20240726150728.3159964-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 45bf05ad5c53..9b7bcfce6920 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1251,7 +1251,7 @@ static inline __init void hugetlb_cma_reserve(int order)
 }
 #endif
 
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
 static inline bool hugetlb_pmd_shared(pte_t *pte)
 {
 	return page_count(virt_to_page(pte)) > 1;
@@ -1287,8 +1287,7 @@ bool __vma_private_lock(struct vm_area_struct *vma);
 static inline pte_t *
 hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
 {
-#if defined(CONFIG_HUGETLB_PAGE) && \
-	defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP)
 	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
 
 	/*
-- 
cgit v1.2.3


From e5a41fc77771c7c7f6b9bef85a02b38b802b7371 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 29 Jul 2024 20:38:42 +0200
Subject: mm: simplify arch_make_folio_accessible()

Patch series "mm: remove arch_make_page_accessible()".

Now that s390x implements arch_make_folio_accessible(), let's convert
remaining users to use arch_make_folio_accessible() instead so we can
remove arch_make_page_accessible().


This patch (of 3):

Now that s390x implements HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE, let's turn
generic arch_make_folio_accessible() into a NOP: there are no other
targets that implement HAVE_ARCH_MAKE_PAGE_ACCESSIBLE but not
HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE.

Link: https://lkml.kernel.org/r/20240729183844.388481-1-david@redhat.com
Link: https://lkml.kernel.org/r/20240729183844.388481-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7000fa300f3..fc7bd3864bcd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2215,16 +2215,7 @@ static inline int arch_make_page_accessible(struct page *page)
 #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
 static inline int arch_make_folio_accessible(struct folio *folio)
 {
-	int ret;
-	long i, nr = folio_nr_pages(folio);
-
-	for (i = 0; i < nr; i++) {
-		ret = arch_make_page_accessible(folio_page(folio, i));
-		if (ret)
-			break;
-	}
-
-	return ret;
+	return 0;
 }
 #endif
 
-- 
cgit v1.2.3


From 3290ef3c7f2a8171fc534e02fb95a512eeae689a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 29 Jul 2024 20:38:44 +0200
Subject: s390/uv: drop arch_make_page_accessible()

All code was converted to using arch_make_folio_accessible(), let's drop
arch_make_page_accessible().

Link: https://lkml.kernel.org/r/20240729183844.388481-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fc7bd3864bcd..153a4662a0cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2205,13 +2205,6 @@ static inline bool folio_likely_mapped_shared(struct folio *folio)
 	return atomic_read(&folio->_mapcount) > 0;
 }
 
-#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-static inline int arch_make_page_accessible(struct page *page)
-{
-	return 0;
-}
-#endif
-
 #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
 static inline int arch_make_folio_accessible(struct folio *folio)
 {
-- 
cgit v1.2.3


From c6f53ed8f213a66ae8bc40aa9112c32412c35a21 Mon Sep 17 00:00:00 2001
From: David Finkel <davidf@vimeo.com>
Date: Mon, 29 Jul 2024 10:37:42 -0400
Subject: mm, memcg: cg2 memory{.swap,}.peak write handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm, memcg: cg2 memory{.swap,}.peak write handlers", v7.


This patch (of 2):

Other mechanisms for querying the peak memory usage of either a process or
v1 memory cgroup allow for resetting the high watermark.  Restore parity
with those mechanisms, but with a less racy API.

For example:
 - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
   the high watermark.
 - writing "5" to the clear_refs pseudo-file in a processes's proc
   directory resets the peak RSS.

This change is an evolution of a previous patch, which mostly copied the
cgroup v1 behavior, however, there were concerns about races/ownership
issues with a global reset, so instead this change makes the reset
filedescriptor-local.

Writing any non-empty string to the memory.peak and memory.swap.peak
pseudo-files reset the high watermark to the current usage for subsequent
reads through that same FD.

Notably, following Johannes's suggestion, this implementation moves the
O(FDs that have written) behavior onto the FD write(2) path.  Instead, on
the page-allocation path, we simply add one additional watermark to
conditionally bump per-hierarchy level in the page-counter.

Additionally, this takes Longman's suggestion of nesting the
page-charging-path checks for the two watermarks to reduce the number of
common-case comparisons.

This behavior is particularly useful for work scheduling systems that need
to track memory usage of worker processes/cgroups per-work-item.  Since
memory can't be squeezed like CPU can (the OOM-killer has opinions), these
systems need to track the peak memory usage to compute system/container
fullness when binpacking workitems.

Most notably, Vimeo's use-case involves a system that's doing global
binpacking across many Kubernetes pods/containers, and while we can use
PSI for some local decisions about overload, we strive to avoid packing
workloads too tightly in the first place.  To facilitate this, we track
the peak memory usage.  However, since we run with long-lived workers (to
amortize startup costs) we need a way to track the high watermark while a
work-item is executing.  Polling runs the risk of missing short spikes
that last for timescales below the polling interval, and peak memory
tracking at the cgroup level is otherwise perfect for this use-case.

As this data is used to ensure that binpacked work ends up with sufficient
headroom, this use-case mostly avoids the inaccuracies surrounding
reclaimable memory.

Link: https://lkml.kernel.org/r/20240730231304.761942-1-davidf@vimeo.com
Link: https://lkml.kernel.org/r/20240729143743.34236-1-davidf@vimeo.com
Link: https://lkml.kernel.org/r/20240729143743.34236-2-davidf@vimeo.com
Signed-off-by: David Finkel <davidf@vimeo.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup-defs.h  |  5 +++++
 include/linux/cgroup.h       |  3 +++
 include/linux/memcontrol.h   |  5 +++++
 include/linux/page_counter.h | 11 ++++++++++-
 4 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ae04035b6cbe..7fc2d0195f56 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -775,6 +775,11 @@ struct cgroup_subsys {
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 
+struct cgroup_of_peak {
+	unsigned long		value;
+	struct list_head	list;
+};
+
 /**
  * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
  * @tsk: target task
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c60ba0ab1462..3e0563753cc3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,6 +11,7 @@
 
 #include <linux/sched.h>
 #include <linux/nodemask.h>
+#include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
 #include <linux/fs.h>
@@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
 
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index af7da7bd00af..1b79760af685 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -193,6 +193,11 @@ struct mem_cgroup {
 		struct page_counter memsw;	/* v1 only */
 	};
 
+	/* registered local peak watchers */
+	struct list_head memory_peaks;
+	struct list_head swap_peaks;
+	spinlock_t	 peaks_lock;
+
 	/* Range enforcement for interrupt charges */
 	struct work_struct high_work;
 
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 66ebf9a73158..79dbd8bc35a7 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -26,6 +26,8 @@ struct page_counter {
 	atomic_long_t children_low_usage;
 
 	unsigned long watermark;
+	/* Latest cg2 reset watermark */
+	unsigned long local_watermark;
 	unsigned long failcnt;
 
 	/* Keep all the read most fields in a separete cacheline. */
@@ -84,7 +86,14 @@ int page_counter_memparse(const char *buf, const char *max,
 
 static inline void page_counter_reset_watermark(struct page_counter *counter)
 {
-	counter->watermark = page_counter_read(counter);
+	unsigned long usage = page_counter_read(counter);
+
+	/*
+	 * Update local_watermark first, so it's always <= watermark
+	 * (modulo CPU/compiler re-ordering)
+	 */
+	counter->local_watermark = usage;
+	counter->watermark = usage;
 }
 
 #ifdef CONFIG_MEMCG
-- 
cgit v1.2.3


From a17c7d8fd2b097a422fc892b660e879e19c20214 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 29 Jul 2024 12:50:35 +0100
Subject: userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c

Patch series "Make core VMA operations internal and testable", v4.

There are a number of "core" VMA manipulation functions implemented in
mm/mmap.c, notably those concerning VMA merging, splitting, modifying,
expanding and shrinking, which logically don't belong there.

More importantly this functionality represents an internal implementation
detail of memory management and should not be exposed outside of mm/
itself.

This patch series isolates core VMA manipulation functionality into its
own file, mm/vma.c, and provides an API to the rest of the mm code in
mm/vma.h.

Importantly, it also carefully implements mm/vma_internal.h, which
specifies which headers need to be imported by vma.c, leading to the very
useful property that vma.c depends only on mm/vma.h and mm/vma_internal.h.

This means we can then re-implement vma_internal.h in userland, adding
shims for kernel mechanisms as required, allowing us to unit test internal
VMA functionality.

This testing is useful as opposed to an e.g.  kunit implementation as this
way we can avoid all external kernel side-effects while testing, run tests
VERY quickly, and iterate on and debug problems quickly.

Excitingly this opens the door to, in the future, recreating precise
problems observed in production in userland and very quickly debugging
problems that might otherwise be very difficult to reproduce.

This patch series takes advantage of existing shim logic and full userland
maple tree support contained in tools/testing/radix-tree/ and
tools/include/linux/, separating out shared components of the radix tree
implementation to provide this testing.

Kernel functionality is stubbed and shimmed as needed in
tools/testing/vma/ which contains a fully functional userland
vma_internal.h file and which imports mm/vma.c and mm/vma.h to be directly
tested from userland.

A simple, skeleton testing implementation is provided in
tools/testing/vma/vma.c as a proof-of-concept, asserting that simple VMA
merge, modify (testing split), expand and shrink functionality work
correctly.


This patch (of 4):

This patch forms part of a patch series intending to separate out VMA
logic and render it testable from userspace, which requires that core
manipulation functions be exposed in an mm/-internal header file.

In order to do this, we must abstract APIs we wish to test, in this
instance functions which ultimately invoke vma_modify().

This patch therefore moves all logic which ultimately invokes vma_modify()
to mm/userfaultfd.c, trying to transfer code at a functional granularity
where possible.

[lorenzo.stoakes@oracle.com: fix user-after-free in userfaultfd_clear_vma()]
  Link: https://lkml.kernel.org/r/3c947ddc-b804-49b7-8fe9-3ea3ca13def5@lucifer.local
Link: https://lkml.kernel.org/r/cover.1722251717.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/50c3ed995fd81c45876c86304c8a00bf3e396cfd.1722251717.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Gow <davidgow@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rae Moar <rmoar@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a12bcf042551..9fc6ce15c499 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
 extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
 
+void userfaultfd_reset_ctx(struct vm_area_struct *vma);
+
+struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+					     struct vm_area_struct *prev,
+					     struct vm_area_struct *vma,
+					     unsigned long start,
+					     unsigned long end);
+
+int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+			       struct vm_area_struct *vma,
+			       unsigned long vm_flags,
+			       unsigned long start, unsigned long end,
+			       bool wp_async);
+
+void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
+
+void userfaultfd_release_all(struct mm_struct *mm,
+			     struct userfaultfd_ctx *ctx);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
-- 
cgit v1.2.3


From fa04c08f3ce649b47781c7663e92398197f5b551 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 29 Jul 2024 12:50:36 +0100
Subject: mm: move vma_modify() and helpers to internal header

These are core VMA manipulation functions which invoke VMA splitting and
merging and should not be directly accessed from outside of mm/.

Link: https://lkml.kernel.org/r/5efde0c6342a8860d5ffc90b415f3989fd8ed0b2.1722251717.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Gow <davidgow@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rae Moar <rmoar@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 60 ------------------------------------------------------
 1 file changed, 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 153a4662a0cd..c4054e68dc09 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3279,66 +3279,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
-				  struct vm_area_struct *prev,
-				  struct vm_area_struct *vma,
-				  unsigned long start, unsigned long end,
-				  unsigned long vm_flags,
-				  struct mempolicy *policy,
-				  struct vm_userfaultfd_ctx uffd_ctx,
-				  struct anon_vma_name *anon_name);
-
-/* We are about to modify the VMA's flags. */
-static inline struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
-		  struct vm_area_struct *prev,
-		  struct vm_area_struct *vma,
-		  unsigned long start, unsigned long end,
-		  unsigned long new_flags)
-{
-	return vma_modify(vmi, prev, vma, start, end, new_flags,
-			  vma_policy(vma), vma->vm_userfaultfd_ctx,
-			  anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or anon_name. */
-static inline struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
-		       struct vm_area_struct *prev,
-		       struct vm_area_struct *vma,
-		       unsigned long start,
-		       unsigned long end,
-		       unsigned long new_flags,
-		       struct anon_vma_name *new_name)
-{
-	return vma_modify(vmi, prev, vma, start, end, new_flags,
-			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
-}
-
-/* We are about to modify the VMA's memory policy. */
-static inline struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
-		   struct vm_area_struct *prev,
-		   struct vm_area_struct *vma,
-		   unsigned long start, unsigned long end,
-		   struct mempolicy *new_pol)
-{
-	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
-			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
-
-/* We are about to modify the VMA's flags and/or uffd context. */
-static inline struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
-		       struct vm_area_struct *prev,
-		       struct vm_area_struct *vma,
-		       unsigned long start, unsigned long end,
-		       unsigned long new_flags,
-		       struct vm_userfaultfd_ctx new_ctx)
-{
-	return vma_modify(vmi, prev, vma, start, end, new_flags,
-			  vma_policy(vma), new_ctx, anon_vma_name(vma));
-}
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
-- 
cgit v1.2.3


From d61f0d59683d9c899211c300254d4140c482a6c0 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 29 Jul 2024 12:50:37 +0100
Subject: mm: move vma_shrink(), vma_expand() to internal header

The vma_shrink() and vma_expand() functions are internal VMA manipulation
functions which we ought to abstract for use outside of memory management
code.

To achieve this, we replace shift_arg_pages() in fs/exec.c with an
invocation of a new relocate_vma_down() function implemented in mm/mmap.c,
which enables us to also move move_page_tables() and vma_iter_prev_range()
to internal.h.

The purpose of doing this is to isolate key VMA manipulation functions in
order that we can both abstract them and later render them easily
testable.

Link: https://lkml.kernel.org/r/3cfcd9ec433e032a85f636fdc0d7d98fafbd19c5.1722251717.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Gow <davidgow@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rae Moar <rmoar@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4054e68dc09..1c0d88446eb8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1005,12 +1005,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
 	return mas_prev(&vmi->mas, 0);
 }
 
-static inline
-struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
-{
-	return mas_prev_range(&vmi->mas, 0);
-}
-
 static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
 {
 	return vmi->mas.index;
@@ -2520,11 +2514,6 @@ int set_page_dirty_lock(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
-extern unsigned long move_page_tables(struct vm_area_struct *vma,
-		unsigned long old_addr, struct vm_area_struct *new_vma,
-		unsigned long new_addr, unsigned long len,
-		bool need_rmap_locks, bool for_stack);
-
 /*
  * Flags used by change_protection().  For now we make it a bitmap so
  * that we can pass in multiple flags just like parameters.  However
@@ -3267,11 +3256,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
-		      unsigned long start, unsigned long end, pgoff_t pgoff,
-		      struct vm_area_struct *next);
-extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
-		       unsigned long start, unsigned long end, pgoff_t pgoff);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
@@ -3279,6 +3263,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
-- 
cgit v1.2.3


From 49b1b8d6f6831026cb105b0eafa18f13db612d86 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 29 Jul 2024 12:50:38 +0100
Subject: mm: move internal core VMA manipulation functions to own file

This patch introduces vma.c and moves internal core VMA manipulation
functions to this file from mmap.c.

This allows us to isolate VMA functionality in a single place such that we
can create userspace testing code that invokes this functionality in an
environment where we can implement simple unit tests of core
functionality.

This patch ensures that core VMA functionality is explicitly marked as
such by its presence in mm/vma.h.

It also places the header includes required by vma.c in vma_internal.h,
which is simply imported by vma.c.  This makes the VMA functionality
testable, as userland testing code can simply stub out functionality as
required.

Link: https://lkml.kernel.org/r/c77a6aafb4c42aaadb8e7271a853658cbdca2e22.1722251717.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Gow <davidgow@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rae Moar <rmoar@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 35 -----------------------------------
 1 file changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c0d88446eb8..bd219ac9c026 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1005,21 +1005,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
 	return mas_prev(&vmi->mas, 0);
 }
 
-static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
-{
-	return vmi->mas.index;
-}
-
-static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
-{
-	return vmi->mas.last + 1;
-}
-static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
-				      unsigned long count)
-{
-	return mas_expected_entries(&vmi->mas, count);
-}
-
 static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
 			unsigned long start, unsigned long end, gfp_t gfp)
 {
@@ -2534,21 +2519,6 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
 					    MM_CP_UFFD_WP_RESOLVE)
 
-bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
-{
-	/*
-	 * We want to check manually if we can change individual PTEs writable
-	 * if we can't do that automatically for all PTEs in a mapping. For
-	 * private mappings, that's always the case when we have write
-	 * permissions as we properly have to handle COW.
-	 */
-	if (vma->vm_flags & VM_SHARED)
-		return vma_wants_writenotify(vma, vma->vm_page_prot);
-	return !!(vma->vm_flags & VM_WRITE);
-
-}
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 extern long change_protection(struct mmu_gather *tlb,
@@ -3256,12 +3226,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void unlink_file_vma(struct vm_area_struct *);
-extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-	unsigned long addr, unsigned long len, pgoff_t pgoff,
-	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
 
-- 
cgit v1.2.3


From 29943248af0a8ac3edb808564baa417b40e35521 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Mon, 29 Jul 2024 14:47:17 +0530
Subject: mm: improve code consistency with zonelist_* helper functions

Replace direct access to zoneref->zone, zoneref->zone_idx, or
zone_to_nid(zoneref->zone) with the corresponding zonelist_* helper
functions for consistency.

No functional change.

Link: https://lkml.kernel.org/r/20240729091717.464-1-shivankg@amd.com
Co-developed-by: Shivank Garg <shivankg@amd.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1dc6248feb83..dbb69f96a152 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1688,7 +1688,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 			zone = zonelist_zone(z))
 
 #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
-	for (zone = z->zone;	\
+	for (zone = zonelist_zone(z);	\
 		zone;							\
 		z = next_zones_zonelist(++z, highidx, nodemask),	\
 			zone = zonelist_zone(z))
@@ -1724,7 +1724,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 	nid = first_node(*nodes);
 	zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 	z = first_zones_zonelist(zonelist, ZONE_NORMAL,	nodes);
-	return (!z->zone) ? true : false;
+	return (!zonelist_zone(z)) ? true : false;
 }
 
 
-- 
cgit v1.2.3


From 9f101bef408a3f70c44b6e4de44d3d4e2655ed10 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Tue, 30 Jul 2024 19:13:39 +1200
Subject: mm: swap: add nr argument in swapcache_prepare and swapcache_clear to
 support large folios

Right now, swapcache_prepare() and swapcache_clear() supports one entry
only, to support large folios, we need to handle multiple swap entries.

To optimize stack usage, we iterate twice in __swap_duplicate(): the first
time to verify that all entries are valid, and the second time to apply
the modifications to the entries.

Currently, we're using nr=1 for the existing users.

[v-songbaohua@oppo.com: clarify swap_count_continued and improve readability for  __swap_duplicate]
  Link: https://lkml.kernel.org/r/20240802071817.47081-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240730071339.107447-2-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba7ea95d1c57..5b920fa2315b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -480,7 +480,7 @@ extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
-extern int swapcache_prepare(swp_entry_t);
+extern int swapcache_prepare(swp_entry_t entry, int nr);
 extern void swap_free_nr(swp_entry_t entry, int nr_pages);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
 extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
@@ -554,7 +554,7 @@ static inline int swap_duplicate(swp_entry_t swp)
 	return 0;
 }
 
-static inline int swapcache_prepare(swp_entry_t swp)
+static inline int swapcache_prepare(swp_entry_t swp, int nr)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 94ccd21e9a5f41585bd16cc84dab2afa6cc21149 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 31 Jul 2024 16:20:00 +0200
Subject: mm/hugetlb: remove hugetlb_follow_page_mask() leftover

We removed hugetlb_follow_page_mask() in commit 9cb28da54643 ("mm/gup:
handle hugetlb in the generic follow_page_mask code") but forgot to
cleanup some leftovers.

While at it, simplify the hugetlb comment, it's overly detailed and rather
confusing.  Stating that we may end up in there during coredumping is
sufficient to explain the PF_DUMPCORE usage.

Link: https://lkml.kernel.org/r/20240731142000.625044-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 9b7bcfce6920..3100a52ceb73 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -127,9 +127,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 			    struct vm_area_struct *, struct vm_area_struct *);
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
-				      unsigned long address, unsigned int flags,
-				      unsigned int *page_mask);
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *,
 			  zap_flags_t);
-- 
cgit v1.2.3


From 17d5f38b33b66210c5a46af639c47fddfe1c5b4d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 31 Jul 2024 18:07:58 +0200
Subject: mm: clarify folio_likely_mapped_shared() documentation for KSM folios

For KSM folios, the function actually does what it is supposed to do: even
having multiple mappings inside the same MM is considered "sharing", as
there is no real relationship between these KSM page mappings -- in
contrast to mapping the same file range twice and having the same
pagecache page mapped twice.

Link: https://lkml.kernel.org/r/20240731160758.808925-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd219ac9c026..2f6c08b53e4f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2132,14 +2132,19 @@ static inline size_t folio_size(const struct folio *folio)
  * MM ("mapped shared"), or if the folio is only mapped into a single MM
  * ("mapped exclusively").
  *
+ * For KSM folios, this function also returns "mapped shared" when a folio is
+ * mapped multiple times into the same MM, because the individual page mappings
+ * are independent.
+ *
  * As precise information is not easily available for all folios, this function
  * estimates the number of MMs ("sharers") that are currently mapping a folio
  * using the number of times the first page of the folio is currently mapped
  * into page tables.
  *
- * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
- * the return value will be exactly correct, because they can only be mapped
- * at most once into an MM, and they cannot be partially mapped.
+ * For small anonymous folios and anonymous hugetlb folios, the return
+ * value will be exactly correct: non-KSM folios can only be mapped at most once
+ * into an MM, and they cannot be partially mapped. KSM folios are
+ * considered shared even if mapped multiple times into the same MM.
  *
  * For other folios, the result can be fuzzy:
  *    #. For partially-mappable large folios (THP), the return value can wrongly
@@ -2148,9 +2153,6 @@ static inline size_t folio_size(const struct folio *folio)
  *    #. For pagecache folios (including hugetlb), the return value can wrongly
  *       indicate "mapped shared" (false positive) when two VMAs in the same MM
  *       cover the same file range.
- *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
- *       shared" (false positive), when the folio is mapped multiple times into
- *       the same MM.
  *
  * Further, this function only considers current page table mappings that
  * are tracked using the folio mapcount(s).
-- 
cgit v1.2.3


From 03790c51a475c46647079e67e2460a149938bfd6 Mon Sep 17 00:00:00 2001
From: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Date: Thu, 1 Aug 2024 23:25:47 +0000
Subject: mm: create promo_wmark_pages and clean up open-coded sites

Patch series "mm: print the promo watermark in zoneinfo", v2.


This patch (of 2):

Define promo_wmark_pages and convert current call sites of wmark_pages
with fixed WMARK_PROMO to using it instead.

Link: https://lkml.kernel.org/r/20240801232548.36604-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240801232548.36604-2-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dbb69f96a152..9f3589f7f05c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -669,6 +669,7 @@ enum zone_watermarks {
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 /*
-- 
cgit v1.2.3


From 620943d7ee69070df8844235d58843af48ac70e2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 1 Aug 2024 16:50:05 -0700
Subject: include/linux/mmzone.h: clean up watermark accessors

- we have a helper wmark_pages().  Teach min_wmark_pages(),
  low_wmark_pages(), high_wmark_pages() and promo_wmark_pages() to use
  it instead of open-coding its implementation.

- there's no reason to implement all these things as macros.  Redo them
  in C.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9f3589f7f05c..17506e4a2835 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -666,12 +666,6 @@ enum zone_watermarks {
 #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
 #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
 
-#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
-#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
-#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
-#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost)
-#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
-
 /*
  * Flags used in pcp->flags field.
  *
@@ -1017,6 +1011,32 @@ enum zone_flags {
 	ZONE_BELOW_HIGH,		/* zone is below high watermark. */
 };
 
+static inline unsigned long wmark_pages(const struct zone *z,
+					enum zone_watermarks w)
+{
+	return z->_watermark[w] + z->watermark_boost;
+}
+
+static inline unsigned long min_wmark_pages(const struct zone *z)
+{
+	return wmark_pages(z, WMARK_MIN);
+}
+
+static inline unsigned long low_wmark_pages(const struct zone *z)
+{
+	return wmark_pages(z, WMARK_LOW);
+}
+
+static inline unsigned long high_wmark_pages(const struct zone *z)
+{
+	return wmark_pages(z, WMARK_HIGH);
+}
+
+static inline unsigned long promo_wmark_pages(const struct zone *z)
+{
+	return wmark_pages(z, WMARK_PROMO);
+}
+
 static inline unsigned long zone_managed_pages(struct zone *zone)
 {
 	return (unsigned long)atomic_long_read(&zone->managed_pages);
-- 
cgit v1.2.3


From aa39ca6940f1a0540f2984051b3089972f42959b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 2 Aug 2024 17:55:15 +0200
Subject: mm/pagewalk: introduce folio_walk_start() + folio_walk_end()

We want to get rid of follow_page(), and have a more reasonable way to
just lookup a folio mapped at a certain address, perform some checks while
still under PTL, and then only conditionally grab a folio reference if
really required.

Further, we might want to get rid of some walk_page_range*() users that
really only want to temporarily lookup a single folio at a single address.

So let's add a new page table walker that does exactly that, similarly to
GUP also being able to walk hugetlb VMAs.

Add folio_walk_end() as a macro for now: the compiler is not easy to
please with the pte_unmap()->kunmap_local().

Note that one difference between follow_page() and get_user_pages(1) is
that follow_page() will not trigger faults to get something mapped.  So
folio_walk is at least currently not a replacement for get_user_pages(1),
but could likely be extended/reused to achieve something similar in the
future.

Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagewalk.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 27cd1e59ccf7..f5eb5a32aeed 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		      pgoff_t nr, const struct mm_walk_ops *ops,
 		      void *private);
 
+typedef int __bitwise folio_walk_flags_t;
+
+/*
+ * Walk migration entries as well. Careful: a large folio might get split
+ * concurrently.
+ */
+#define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0))
+
+/* Walk shared zeropages (small + huge) as well. */
+#define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1))
+
+enum folio_walk_level {
+	FW_LEVEL_PTE,
+	FW_LEVEL_PMD,
+	FW_LEVEL_PUD,
+};
+
+/**
+ * struct folio_walk - folio_walk_start() / folio_walk_end() data
+ * @page:	exact folio page referenced (if applicable)
+ * @level:	page table level identifying the entry type
+ * @pte:	pointer to the page table entry (FW_LEVEL_PTE).
+ * @pmd:	pointer to the page table entry (FW_LEVEL_PMD).
+ * @pud:	pointer to the page table entry (FW_LEVEL_PUD).
+ * @ptl:	pointer to the page table lock.
+ *
+ * (see folio_walk_start() documentation for more details)
+ */
+struct folio_walk {
+	/* public */
+	struct page *page;
+	enum folio_walk_level level;
+	union {
+		pte_t *ptep;
+		pud_t *pudp;
+		pmd_t *pmdp;
+	};
+	union {
+		pte_t pte;
+		pud_t pud;
+		pmd_t pmd;
+	};
+	/* private */
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
+};
+
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags);
+
+#define folio_walk_end(__fw, __vma) do { \
+	spin_unlock((__fw)->ptl); \
+	if (likely((__fw)->level == FW_LEVEL_PTE)) \
+		pte_unmap((__fw)->ptep); \
+	vma_pgtable_walk_end(__vma); \
+} while (0)
+
 #endif /* _LINUX_PAGEWALK_H */
-- 
cgit v1.2.3


From 8710f6ed34e7bcf98971d65acfb5eaed5fc469b0 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 2 Aug 2024 17:55:20 +0200
Subject: mm/huge_memory: convert split_huge_pages_pid() from follow_page() to
 folio_walk

Let's remove yet another follow_page() user.  Note that we have to do the
split without holding the PTL, after folio_walk_end().  We don't care
about losing the secretmem check in follow_page().

[david@redhat.com: teach can_split_folio() that we are not holding an additional reference]
  Link: https://lkml.kernel.org/r/c75d1c6c-8ea6-424f-853c-1ccda6c77ba2@redhat.com
Link: https://lkml.kernel.org/r/20240802155524.517137-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..ce44caa40eed 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 		unsigned long len, unsigned long pgoff, unsigned long flags,
 		vm_flags_t vm_flags);
 
-bool can_split_folio(struct folio *folio, int *pextra_pins);
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
 static inline int split_huge_page(struct page *page)
@@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
 }
 
 static inline bool
-can_split_folio(struct folio *folio, int *pextra_pins)
+can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 7290840de65e04c1fc59dab692468fc9600c6038 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 2 Aug 2024 17:55:23 +0200
Subject: mm: remove follow_page()

All users are gone, let's remove it and any leftovers in comments.  We'll
leave any FOLL/follow_page_() naming cleanups as future work.

Link: https://lkml.kernel.org/r/20240802155524.517137-11-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2f6c08b53e4f..ee8cea73d415 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3527,9 +3527,6 @@ static inline vm_fault_t vmf_fs_error(int err)
 	return VM_FAULT_SIGBUS;
 }
 
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-			 unsigned int foll_flags);
-
 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 {
 	if (vm_fault & VM_FAULT_OOM)
-- 
cgit v1.2.3


From e31c38e037621c445bb4393fd77e0a76e6e0899a Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Mon, 5 Aug 2024 16:22:42 -0700
Subject: zswap: implement a second chance algorithm for dynamic zswap shrinker

Patch series "improving dynamic zswap shrinker protection scheme", v3.

When experimenting with the memory-pressure based (i.e "dynamic") zswap
shrinker in production, we observed a sharp increase in the number of
swapins, which led to performance regression.  We were able to trace this
regression to the following problems with the shrinker's warm pages
protection scheme:

1. The protection decays way too rapidly, and the decaying is coupled with
   zswap stores, leading to anomalous patterns, in which a small batch of
   zswap stores effectively erase all the protection in place for the
   warmer pages in the zswap LRU.

   This observation has also been corroborated upstream by Takero Funaki
   (in [1]).

2. We inaccurately track the number of swapped in pages, missing the
   non-pivot pages that are part of the readahead window, while counting
   the pages that are found in the zswap pool.


To alleviate these two issues, this patch series improve the dynamic zswap
shrinker in the following manner:

1. Replace the protection size tracking scheme with a second chance
   algorithm. This new scheme removes the need for haphazard stats
   decaying, and automatically adjusts the pace of pages aging with memory
   pressure, and writeback rate with pool activities: slowing down when
   the pool is dominated with zswpouts, and speeding up when the pool is
   dominated with stale entries.

2. Fix the tracking of the number of swapins to take into account
   non-pivot pages in the readahead window.

With these two changes in place, in a kernel-building benchmark without
any cold data added, the number of swapins is reduced by 64.12%.  This
translate to a 10.32% reduction in build time.  We also observe a 3%
reduction in kernel CPU time.

In another benchmark, with cold data added (to gauge the new algorithm's
ability to offload cold data), the new second chance scheme outperforms
the old protection scheme by around 0.7%, and actually written back around
21% more pages to backing swap device.  So the new scheme is just as good,
if not even better than the old scheme on this front as well.

[1]: https://lore.kernel.org/linux-mm/CAPpodddcGsK=0Xczfuk8usgZ47xeyf4ZjiofdT+ujiyz6V2pFQ@mail.gmail.com/


This patch (of 2):

Current zswap shrinker's heuristics to prevent overshrinking is brittle
and inaccurate, specifically in the way we decay the protection size (i.e
making pages in the zswap LRU eligible for reclaim).

We currently decay protection aggressively in zswap_lru_add() calls.  This
leads to the following unfortunate effect: when a new batch of pages enter
zswap, the protection size rapidly decays to below 25% of the zswap LRU
size, which is way too low.

We have observed this effect in production, when experimenting with the
zswap shrinker: the rate of shrinking shoots up massively right after a
new batch of zswap stores.  This is somewhat the opposite of what we want
originally - when new pages enter zswap, we want to protect both these new
pages AND the pages that are already protected in the zswap LRU.

Replace existing heuristics with a second chance algorithm

1. When a new zswap entry is stored in the zswap pool, its referenced
   bit is set.
2. When the zswap shrinker encounters a zswap entry with the referenced
   bit set, give it a second chance - only flips the referenced bit and
   rotate it in the LRU.
3. If the shrinker encounters the entry again, this time with its
   referenced bit unset, then it can reclaim the entry.

In this manner, the aging of the pages in the zswap LRUs are decoupled
from zswap stores, and picks up the pace with increasing memory pressure
(which is what we want).

The second chance scheme allows us to modulate the writeback rate based on
recent pool activities.  Entries that recently entered the pool will be
protected, so if the pool is dominated by such entries the writeback rate
will reduce proportionally, protecting the workload's workingset.On the
other hand, stale entries will be written back quickly, which increases
the effective writeback rate.

The referenced bit is added at the hole after the `length` field of struct
zswap_entry, so there is no extra space overhead for this algorithm.

We will still maintain the count of swapins, which is consumed and
subtracted from the lru size in zswap_shrinker_count(), to further
penalize past overshrinking that led to disk swapins.  The idea is that
had we considered this many more pages in the LRU active/protected, they
would not have been written back and we would not have had to swapped them
in.

To test this new heuristics, I built the kernel under a cgroup with
memory.max set to 2G, on a host with 36 cores:

With the old shrinker:

real: 263.89s
user: 4318.11s
sys: 673.29s
swapins: 227300.5

With the second chance algorithm:

real: 244.85s
user: 4327.22s
sys: 664.39s
swapins: 94663

(average over 5 runs)

We observe an 1.3% reduction in kernel CPU usage, and around 7.2%
reduction in real time. Note that the number of swapped in pages
dropped by 58%.

[nphamcs@gmail.com: fix a small mistake in the referenced bit documentation]
  Link: https://lkml.kernel.org/r/20240806003403.3142387-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20240805232243.2896283-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20240805232243.2896283-2-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Takero Funaki <flintglass@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zswap.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 6cecb4a4f68b..9cd1beef0654 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages;
 
 struct zswap_lruvec_state {
 	/*
-	 * Number of pages in zswap that should be protected from the shrinker.
-	 * This number is an estimate of the following counts:
+	 * Number of swapped in pages from disk, i.e not found in the zswap pool.
 	 *
-	 * a) Recent page faults.
-	 * b) Recent insertion to the zswap LRU. This includes new zswap stores,
-	 *    as well as recent zswap LRU rotations.
-	 *
-	 * These pages are likely to be warm, and might incur IO if the are written
-	 * to swap.
+	 * This is consumed and subtracted from the lru size in
+	 * zswap_shrinker_count() to penalize past overshrinking that led to disk
+	 * swapins. The idea is that had we considered this many more pages in the
+	 * LRU active/protected and not written them back, we would not have had to
+	 * swapped them in.
 	 */
-	atomic_long_t nr_zswap_protected;
+	atomic_long_t nr_disk_swapins;
 };
 
 unsigned long zswap_total_pages(void);
-- 
cgit v1.2.3


From 17fe833b0de08a78bfb51388e0615969d73ea8ad Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Mon, 5 Aug 2024 14:52:03 +0200
Subject: mm: fix (harmless) type confusion in lock_vma_under_rcu()

There is a (harmless) type confusion in lock_vma_under_rcu(): After
vma_start_read(), we have taken the VMA lock but don't know yet whether
the VMA has already been detached and scheduled for RCU freeing.  At this
point, ->vm_start and ->vm_end are accessed.

vm_area_struct contains a union such that ->vm_rcu uses the same memory as
->vm_start and ->vm_end; so accessing ->vm_start and ->vm_end of a
detached VMA is illegal and leads to type confusion between union members.

Fix it by reordering the vma->detached check above the address checks, and
document the rules for RCU readers accessing VMAs.

This will probably change the number of observed VMA_LOCK_MISS events
(since previously, trying to access a detached VMA whose ->vm_rcu has been
scheduled would bail out when checking the fault address against the
rcu_head members reinterpreted as VMA bounds).

Link: https://lkml.kernel.org/r/20240805-fix-vma-lock-type-confusion-v1-1-9f25443a9a71@google.com
Fixes: 50ee32537206 ("mm: introduce lock_vma_under_rcu to be used from arch-specific code")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 165c58b12ccc..003619fab20e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -660,6 +660,9 @@ struct vma_numab_state {
  * per VM-area/task. A VM area is any part of the process virtual memory
  * space that has a special rule for the page-fault handlers (ie a shared
  * library, the executable area etc).
+ *
+ * Only explicitly marked struct members may be accessed by RCU readers before
+ * getting a stable reference.
  */
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
@@ -675,7 +678,11 @@ struct vm_area_struct {
 #endif
 	};
 
-	struct mm_struct *vm_mm;	/* The address space we belong to. */
+	/*
+	 * The address space we belong to.
+	 * Unstable RCU readers are allowed to read this.
+	 */
+	struct mm_struct *vm_mm;
 	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
 
 	/*
@@ -688,7 +695,10 @@ struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
-	/* Flag to indicate areas detached from the mm->mm_mt tree */
+	/*
+	 * Flag to indicate areas detached from the mm->mm_mt tree.
+	 * Unstable RCU readers are allowed to read this.
+	 */
 	bool detached;
 
 	/*
@@ -706,6 +716,7 @@ struct vm_area_struct {
 	 * slowpath.
 	 */
 	int vm_lock_seq;
+	/* Unstable RCU readers are allowed to read this. */
 	struct vma_lock *vm_lock;
 #endif
 
-- 
cgit v1.2.3


From cc0a0f98553528791ae33ba5ee8c118b52ae2028 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 5 Aug 2024 14:39:39 +0200
Subject: kfence: introduce burst mode

Introduce burst mode, which can be configured with kfence.burst=$count,
where the burst count denotes the additional successive slab allocations
to be allocated through KFENCE for each sample interval.

The idea is that this can give developers an additional knob to make
KFENCE more aggressive when debugging specific issues of systems where
either rebooting or recompiling the kernel with KASAN is not possible.

Experiment: To assess the effectiveness of the new option, we randomly
picked a recent out-of-bounds [1] and use-after-free bug [2], each with a
reproducer provided by syzbot, that initially detected these bugs with
KASAN.  We then tried to reproduce the bugs with KFENCE below.

[1] Fixed by: 7c55b78818cf ("jfs: xattr: fix buffer overflow for invalid xattr")
    https://syzkaller.appspot.com/bug?id=9d1b59d4718239da6f6069d3891863c25f9f24a2
[2] Fixed by: f8ad00f3fb2a ("l2tp: fix possible UAF when cleaning up tunnels")
    https://syzkaller.appspot.com/bug?id=4f34adc84f4a3b080187c390eeef60611fd450e1

The following KFENCE configs were compared. A pool size of 1023 objects
was used for all configurations.

	Baseline
		kfence.sample_interval=100
		kfence.skip_covered_thresh=75
		kfence.burst=0

	Aggressive
		kfence.sample_interval=1
		kfence.skip_covered_thresh=10
		kfence.burst=0

	AggressiveBurst
		kfence.sample_interval=1
		kfence.skip_covered_thresh=10
		kfence.burst=1000

Each reproducer was run 10 times (after a fresh reboot), with the
following detection counts for each KFENCE config:

                    | Detection Count out of 10 |
                    |    OOB [1]  |    UAF [2]  |
  ------------------+-------------+-------------+
  Default           |     0/10    |     0/10    |
  Aggressive        |     0/10    |     0/10    |
  AggressiveBurst   |     8/10    |     8/10    |

With the Default and even the Aggressive configs the results are
unsurprising, given KFENCE has not been designed for deterministic bug
detection of small test cases.

However, when enabling burst mode with relatively large burst count,
KFENCE can start to detect heap memory-safety bugs even in simpler test
cases with high probability (in the above cases with ~80% probability).

Link: https://lkml.kernel.org/r/20240805124203.2692278-1-elver@google.com
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kfence.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 88100cc9caba..0ad1ddbb8b99 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -124,7 +124,7 @@ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp
 	if (!static_branch_likely(&kfence_allocation_key))
 		return NULL;
 #endif
-	if (likely(atomic_read(&kfence_allocation_gate)))
+	if (likely(atomic_read(&kfence_allocation_gate) > 0))
 		return NULL;
 	return __kfence_alloc(s, size, flags);
 }
-- 
cgit v1.2.3


From 47baed6a132f611228113851a70c73f6e6478d87 Mon Sep 17 00:00:00 2001
From: Jianhui Zhou <912460177@qq.com>
Date: Wed, 7 Aug 2024 15:44:48 +0800
Subject: percpu: remove pcpu_alloc_size()

pcpu_alloc_size() was added in 7ac5c53e0073 "mm/percpu.c: introduce
pcpu_alloc_size()", which is used to get the allocated memory size in bpf.
However, pcpu_alloc_size() is no longer used in "bpf: Use c->unit_size to
select target cache during free" because its actuall allocated memory size
may change at runtime due to its slab merging mechanism.  Therefore,
pcpu_alloc_size() can be removed.

Link: https://lkml.kernel.org/r/tencent_AD5C50E8D78C07A3CE539BD5F6BF39706507@qq.com
Signed-off-by: Jianhui Zhou <912460177@qq.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: JonasZhou <JonasZhou@zhaoxin.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4b2047b78b67..b6321fc49159 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -135,7 +135,6 @@ extern void __init setup_per_cpu_areas(void);
 
 extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
 				   gfp_t gfp) __alloc_size(1);
-extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
 #define __alloc_percpu_gfp(_size, _align, _gfp)				\
 	alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp))
-- 
cgit v1.2.3


From 09022bc196d23484a7a5d48cf373f8583e3fcf23 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 7 Aug 2024 20:35:26 +0100
Subject: mm: remove PG_error

The PG_error bit is now unused; delete it and free up a bit in
page->flags.

Link: https://lkml.kernel.org/r/20240807193528.1865100-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5769fe6e4950..a0a29bd092f8 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -66,8 +66,6 @@
  * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
  * file-backed pagecache (see mm/vmscan.c).
  *
- * PG_error is set to indicate that an I/O error occurred on this page.
- *
  * PG_arch_1 is an architecture specific page state bit.  The generic code
  * guarantees that this bit is cleared for a page when it first is entered into
  * the page cache.
@@ -103,7 +101,6 @@ enum pageflags {
 	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
 	PG_active,
 	PG_workingset,
-	PG_error,
 	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
 	PG_arch_1,
 	PG_reserved,
@@ -183,7 +180,7 @@ enum pageflags {
 	 */
 
 	/* At least one page in this folio has the hwpoison flag set */
-	PG_has_hwpoisoned = PG_error,
+	PG_has_hwpoisoned = PG_active,
 	PG_large_rmappable = PG_workingset, /* anon or file-backed */
 };
 
@@ -506,7 +503,6 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
-PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
 FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
 	FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
 	__FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
-- 
cgit v1.2.3


From 310183de7bb2ed114a80d057690ddb23d0cfe814 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 9 Aug 2024 14:48:50 +0300
Subject: mm: introduce PageUnaccepted() page type

The new page type allows physical memory scanners to detect unaccepted
memory and handle it accordingly.

The page type is serialized with zone lock.

Link: https://lkml.kernel.org/r/20240809114854.3745464-5-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a0a29bd092f8..17175a328e6b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -939,6 +939,7 @@ enum pagetype {
 	PG_hugetlb	= 0x04000000,
 	PG_slab		= 0x02000000,
 	PG_zsmalloc	= 0x01000000,
+	PG_unaccepted	= 0x00800000,
 
 	PAGE_TYPE_BASE	= 0x80000000,
 
@@ -1072,6 +1073,13 @@ FOLIO_TEST_FLAG_FALSE(hugetlb)
 
 PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
 
+/*
+ * Mark pages that has to be accepted before touched for the first time.
+ *
+ * Serialized with zone lock.
+ */
+PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
+
 /**
  * PageHuge - Determine if the page belongs to hugetlbfs
  * @page: The page to test.
-- 
cgit v1.2.3


From 5adfeaecc487e7023f1c7bbdc081707d7a93110f Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 9 Aug 2024 14:48:51 +0300
Subject: mm: rework accept memory helpers

Make accept_memory() and range_contains_unaccepted_memory() take 'start'
and 'size' arguments instead of 'start' and 'end'.

Remove accept_page(), replacing it with direct calls to accept_memory().
The accept_page() name is going to be used for a different function.

Link: https://lkml.kernel.org/r/20240809114854.3745464-6-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee8cea73d415..b1eed30fdc06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4062,18 +4062,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
 
-bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
-void accept_memory(phys_addr_t start, phys_addr_t end);
+bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
+void accept_memory(phys_addr_t start, unsigned long size);
 
 #else
 
 static inline bool range_contains_unaccepted_memory(phys_addr_t start,
-						    phys_addr_t end)
+						    unsigned long size)
 {
 	return false;
 }
 
-static inline void accept_memory(phys_addr_t start, phys_addr_t end)
+static inline void accept_memory(phys_addr_t start, unsigned long size)
 {
 }
 
@@ -4081,9 +4081,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
 
 static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
 {
-	phys_addr_t paddr = pfn << PAGE_SHIFT;
-
-	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+	return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
 }
 
 void vma_pgtable_walk_begin(struct vm_area_struct *vma);
-- 
cgit v1.2.3


From 1c399e74a97ca733d0780cc51819aa81fb292c22 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 12 Aug 2024 14:12:23 -0400
Subject: mm/x86: implement arch_check_zapped_pud()

Introduce arch_check_zapped_pud() to sanity check shadow stack on PUD
zaps.  It has the same logic as the PMD helper.

One thing to mention is, it might be a good idea to use page_table_check
in the future for trapping wrong setups of shadow stack pgtable entries
[1].  That is left for the future as a separate effort.

[1] https://lore.kernel.org/all/59d518698f664e07c036a5098833d7b56b953305.camel@intel.com

Link: https://lkml.kernel.org/r/20240812181225.1360970-6-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: "Edgecombe, Rick P" <rick.p.edgecombe@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a6a3cccfc36..780f3b439d98 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -447,6 +447,12 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
 }
 #endif
 
+#ifndef arch_check_zapped_pud
+static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
+{
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
-- 
cgit v1.2.3


From cb0f01beb16669e91510fcdb2cea213931aee017 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 12 Aug 2024 14:12:25 -0400
Subject: mm/mprotect: fix dax pud handlings

This is only relevant to the two archs that support PUD dax, aka, x86_64
and ppc64.  PUD THPs do not yet exist elsewhere, and hugetlb PUDs do not
count in this case.

DAX have had PUD mappings for years, but change protection path never
worked.  When the path is triggered in any form (a simple test program
would be: call mprotect() on a 1G dev_dax mapping), the kernel will report
"bad pud".  This patch should fix that.

The new change_huge_pud() tries to keep everything simple.  For example,
it doesn't optimize write bit as that will need even more PUD helpers.
It's not too bad anyway to have one more write fault in the worst case
once for 1G range; may be a bigger thing for each PAGE_SIZE, though.
Neither does it support userfault-wp bits, as there isn't such PUD
mappings that is supported; file mappings always need a split there.

The same to TLB shootdown: the pmd path (which was for x86 only) has the
trick of using _ad() version of pmdp_invalidate*() which can avoid one
redundant TLB, but let's also leave that for later.  Again, the larger the
mapping, the smaller of such effect.

There's some difference on handling "retry" for change_huge_pud() (where
it can return 0): it isn't like change_huge_pmd(), as the pmd version is
safe with all conditions handled in change_pte_range() later, thanks to
Hugh's new pte_offset_map_lock().  In short, change_pte_range() is simply
smarter.  For that, change_pud_range() will need proper retry if it races
with something else when a huge PUD changed from under us.

The last thing to mention is currently the PUD path ignores the huge pte
numa counter (NUMA_HUGE_PTE_UPDATES), not only because DAX is not
applicable to NUMA, but also that it's ambiguous on its own to decide how
to account pud in this case.  In one earlier version of this patchset I
proposed to remove the counter as it doesn't even look right to do the
accounting as of now [1], but then a further discussion suggests we can
leave that for later, as that doesn't block this series if we choose to
ignore that counter.  That's what this patch does, by ignoring it.

When at it, touch up the comment in pgtable_split_needed() to make it
generic to either pmd or pud file THPs.

[1] https://lore.kernel.org/all/20240715192142.3241557-3-peterx@redhat.com/
[2] https://lore.kernel.org/r/added2d0-b8be-4108-82ca-1367a388d0b1@redhat.com

Link: https://lkml.kernel.org/r/20240812181225.1360970-8-peterx@redhat.com
Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages")
Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Edgecombe, Rick P" <rick.p.edgecombe@intel.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ce44caa40eed..6370026689e0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -342,6 +342,17 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long address);
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		    pud_t *pudp, unsigned long addr, pgprot_t newprot,
+		    unsigned long cp_flags);
+#else
+static inline int
+change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		pud_t *pudp, unsigned long addr, pgprot_t newprot,
+		unsigned long cp_flags) { return 0; }
+#endif
+
 #define split_huge_pud(__vma, __pud, __address)				\
 	do {								\
 		pud_t *____pud = (__pud);				\
@@ -585,6 +596,19 @@ static inline int next_order(unsigned long *orders, int prev)
 {
 	return 0;
 }
+
+static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+				    unsigned long address)
+{
+}
+
+static inline int change_huge_pud(struct mmu_gather *tlb,
+				  struct vm_area_struct *vma, pud_t *pudp,
+				  unsigned long addr, pgprot_t newprot,
+				  unsigned long cp_flags)
+{
+	return 0;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
-- 
cgit v1.2.3


From 223febc6e5573cda5e94e6b38fcdaded068db777 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 12 Aug 2024 18:26:02 +1000
Subject: mm: add optional close() to struct vm_special_mapping

Add an optional close() callback to struct vm_special_mapping.  It will be
used, by powerpc at least, to handle unmapping of the VDSO.

Although support for unmapping the VDSO was initially added for CRIU[1],
it is not desirable to guard that support behind
CONFIG_CHECKPOINT_RESTORE.

There are other known users of unmapping the VDSO which are not related to
CRIU, eg.  Valgrind [2] and void-ship [3].

The powerpc arch_unmap() hook has been in place for ~9 years, with no
ifdef, so there may be other unknown users that have come to rely on
unmapping the VDSO.  Even if the code was behind an ifdef, major distros
enable CHECKPOINT_RESTORE so users may not realise unmapping the VDSO
depends on that configuration option.

It's also undesirable to have such core mm behaviour behind a relatively
obscure CONFIG option.

Longer term the unmap behaviour should be standardised across
architectures, however that is complicated by the fact the VDSO pointer is
stored differently across architectures.  There was a previous attempt to
unify that handling [4], which could be revived.

See [5] for further discussion.

[1]: commit 83d3f0e90c6c ("powerpc/mm: tracking vDSO remap")
[2]: https://sourceware.org/git/?p=valgrind.git;a=commit;h=3a004915a2cbdcdebafc1612427576bf3321eef5
[3]: https://github.com/insanitybit/void-ship
[4]: https://lore.kernel.org/lkml/20210611180242.711399-17-dima@arista.com/
[5]: https://lore.kernel.org/linuxppc-dev/shiq5v3jrmyi6ncwke7wgl76ojysgbhrchsk32q4lbx2hadqqc@kzyy2igem256

Link: https://lkml.kernel.org/r/20240812082605.743814-1-mpe@ellerman.id.au
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 003619fab20e..2419e60c9a7f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1324,6 +1324,9 @@ struct vm_special_mapping {
 
 	int (*mremap)(const struct vm_special_mapping *sm,
 		     struct vm_area_struct *new_vma);
+
+	void (*close)(const struct vm_special_mapping *sm,
+		      struct vm_area_struct *vma);
 };
 
 enum tlb_flush_reason {
-- 
cgit v1.2.3


From 497258dfafcc6b88e7c4c46a38a479721d44cf41 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 20 Aug 2024 15:14:47 -0700
Subject: mm: remove legacy install_special_mapping() code

All relevant architectures had already been converted to the new interface
(which just has an underscore in front of the name - not very imaginative
naming), this just force-converts the stragglers.

The modern interface is almost identical to the old one, except instead of
the page pointer it takes a "struct vm_special_mapping" that describes the
mapping (and contains the page pointer as one member), and it returns the
resulting 'vma' instead of just the error code.

Getting rid of the old interface also gets rid of some special casing,
which had caused problems with the mremap extensions to "struct
vm_special_mapping".

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/CAHk-=whvR+z=0=0gzgdfUiK70JTa-=+9vxD-4T=3BagXR6dciA@mail.gmail.comTested-by: Rob Landley <rob@landley.net> # arch/sh/
Link: https://lore.kernel.org/all/20240819195120.GA1113263@thelio-3990X/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Rob Landley <rob@landley.net>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b1eed30fdc06..0fde51c0532f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3263,10 +3263,6 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
 				   unsigned long flags,
 				   const struct vm_special_mapping *spec);
-/* This is an obsolete alternative to _install_special_mapping. */
-extern int install_special_mapping(struct mm_struct *mm,
-				   unsigned long addr, unsigned long len,
-				   unsigned long flags, struct page **pages);
 
 unsigned long randomize_stack_top(unsigned long stack_top);
 unsigned long randomize_page(unsigned long start, unsigned long range);
-- 
cgit v1.2.3


From 02f4bbefcada38cab86b365e5c795e0f858356bc Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Wed, 14 Aug 2024 17:34:15 +0800
Subject: mm: kmem: add lockdep assertion to obj_cgroup_memcg

obj_cgroup_memcg() is supposed to safe to prevent the returned memory
cgroup from being freed only when the caller is holding the rcu read lock
or objcg_lock or cgroup_mutex.  It is very easy to ignore thoes conditions
when users call some upper APIs which call obj_cgroup_memcg() internally
like mem_cgroup_from_slab_obj() (See the link below).  So it is better to
add lockdep assertion to obj_cgroup_memcg() to find those issues ASAP.

Because there is no user of obj_cgroup_memcg() holding objcg_lock to make
the returned memory cgroup safe, do not add objcg_lock assertion (We
should export objcg_lock if we really want to do).  Additionally, this is
some internal implementation detail of memcg and should not be accessible
outside memcg code.

Some users like __mem_cgroup_uncharge() do not care the lifetime of the
returned memory cgroup, which just want to know if the folio is charged to
a memory cgroup, therefore, they do not need to hold the needed locks.  In
which case, introduce a new helper folio_memcg_charged() to do this.
Compare it to folio_memcg(), it could eliminate a memory access of
objcg->memcg for kmem, actually, a really small gain.

[songmuchun@bytedance.com: fix split_page_memcg()]
  Link: https://lkml.kernel.org/r/20240819080415.44964-1-songmuchun@bytedance.com
Link: https://lore.kernel.org/all/20240718083607.42068-1-songmuchun@bytedance.com/
Link: https://lkml.kernel.org/r/20240814093415.17634-1-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1b79760af685..07eadf7ecbba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -366,11 +366,11 @@ static inline bool folio_memcg_kmem(struct folio *folio);
  * After the initialization objcg->memcg is always pointing at
  * a valid memcg, but can be atomically swapped to the parent memcg.
  *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
+ * The caller must ensure that the returned memcg won't be released.
  */
 static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
 {
+	lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex));
 	return READ_ONCE(objcg->memcg);
 }
 
@@ -444,6 +444,19 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio)
 	return __folio_memcg(folio);
 }
 
+/*
+ * folio_memcg_charged - If a folio is charged to a memory cgroup.
+ * @folio: Pointer to the folio.
+ *
+ * Returns true if folio is charged to a memory cgroup, otherwise returns false.
+ */
+static inline bool folio_memcg_charged(struct folio *folio)
+{
+	if (folio_memcg_kmem(folio))
+		return __folio_objcg(folio) != NULL;
+	return __folio_memcg(folio) != NULL;
+}
+
 /**
  * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
  * @folio: Pointer to the folio.
@@ -460,7 +473,6 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 	unsigned long memcg_data = READ_ONCE(folio->memcg_data);
 
 	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	if (memcg_data & MEMCG_DATA_KMEM) {
 		struct obj_cgroup *objcg;
@@ -469,6 +481,8 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 		return obj_cgroup_memcg(objcg);
 	}
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 }
 
-- 
cgit v1.2.3


From bd164d81a767f33c30d5c5b50b775631699ee976 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Wed, 14 Aug 2024 12:19:28 -0400
Subject: maple_tree: introduce store_type enum

Patch series "Introduce a store type enum for the Maple tree", v4.

================================ OVERVIEW ================================

This series implements two work items[3]: "aligning mas_store_gfp() with
mas_preallocate()" and "enum for store type".

mas_store_gfp() is modified to preallocate nodes.  This simplies many of
the write helper functions by allowing them to use mas_store_gfp() rather
than open coding node allocation and error handling.

The enum defines the following store types:

enum store_type {
    wr_invalid,
    wr_new_root,
    wr_store_root,
    wr_exact_fit,
    wr_spanning_store,
    wr_split_store,
    wr_rebalance,
    wr_append,
    wr_node_store,
    wr_slot_store,
};

In the current maple tree code, a walk down the tree is done in
mas_preallocate() to determine the number of nodes needed for this write.
After node allocation, mas_wr_store_entry() will perform another walk to
determine which write helper function to use to complete the write.

Rather than performing the second walk, we can store the type of write in
the maple write state during node allocation and read this field to
complete the write.

Patches 1-16 implement this store type feature.
Patch 17 is a cleanup patch to change functions that have unused return
types to be void.

================================ RESULTS =================================

Phoronix t-test-1 (Seconds < Lower Is Better)
    v6.10-rc6
        Threads: 1
            33.15

        Threads: 2
            10.81

    v6.10-rc6 + this series
            Threads: 1
            32.69

        Threads: 2
            10.45

Stress-ng mmap
                    6.10_base  store_type_v4
Duration User        2744.65     2769.40
Duration System     10862.69    10817.59
Duration Elapsed     1477.58     1478.35


================================ TESTING =================================

Testing was done with the maple tree test suite.  A new test case is also
added to validate the order in which we test for and assign the store
type.

[1]: https://lore.kernel.org/linux-mm/80926b22-a8d2-9992-eb5e-27e2c99cf460@google.com/T/#m81044feb66765265f8ca7f21e4b4b3725b18780a
[2]: https://lore.kernel.org/linux-mm/80926b22-a8d2-9992-eb5e-27e2c99cf460@google.com/T/#mb36c6526486638e82518c0f37a428fb279c84d8a
[3]: https://lists.infradead.org/pipermail/maple-tree/2023-December/003098.html


This patch (of 17):

Add a store_type enum that is stored in ma_state.  This will be used to
keep track of partial walks of the tree so that subsequent walks can pick
up where a previous walk left off.

Link: https://lkml.kernel.org/r/20240814161944.55347-1-sidhartha.kumar@oracle.com
Link: https://lkml.kernel.org/r/20240814161944.55347-2-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index a53ad4dabd7e..8e1504a81cd2 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -148,6 +148,18 @@ enum maple_type {
 	maple_arange_64,
 };
 
+enum store_type {
+	wr_invalid,
+	wr_new_root,
+	wr_store_root,
+	wr_exact_fit,
+	wr_spanning_store,
+	wr_split_store,
+	wr_rebalance,
+	wr_append,
+	wr_node_store,
+	wr_slot_store,
+};
 
 /**
  * DOC: Maple tree flags
@@ -436,6 +448,7 @@ struct ma_state {
 	unsigned char offset;
 	unsigned char mas_flags;
 	unsigned char end;		/* The end of the node */
+	enum store_type store_type;	/* The type of store needed for this operation */
 };
 
 struct ma_wr_state {
@@ -477,6 +490,7 @@ struct ma_wr_state {
 		.max = ULONG_MAX,					\
 		.alloc = NULL,						\
 		.mas_flags = 0,						\
+		.store_type = wr_invalid,				\
 	}
 
 #define MA_WR_STATE(name, ma_state, wr_entry)				\
-- 
cgit v1.2.3


From 5d383b69a04e2eb2e0ae06e91821fd82cb9acf73 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 14 Aug 2024 22:04:47 -0700
Subject: memcg: move v1 only percpu stats in separate struct

Patch series "memcg: further decouple v1 code from v2".

Some of the v1 code is still in v2 code base due to v1 fields in the
struct memcg_vmstats_percpu.  This field decouples those fileds from v2
struct and move all the related code into v1 only code base.


This patch (of 7):

At the moment struct memcg_vmstats_percpu contains two v1 only fields
which consumes memory even when CONFIG_MEMCG_V1 is not enabled.  In
addition there are v1 only functions accessing them and are in the main
memcontrol source file and can not be moved to v1 only source file due to
these fields.  Let's move these fields into their own struct.  Later
patches will move the functions accessing them to v1 source file and only
allocate these fields when CONFIG_MEMCG_V1 is enabled.

Link: https://lkml.kernel.org/r/20240815050453.1298138-1-shakeel.butt@linux.dev
Link: https://lkml.kernel.org/r/20240815050453.1298138-2-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 07eadf7ecbba..08b7121cf43a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -70,6 +70,7 @@ struct mem_cgroup_id {
 };
 
 struct memcg_vmstats_percpu;
+struct memcg1_events_percpu;
 struct memcg_vmstats;
 struct lruvec_stats_percpu;
 struct lruvec_stats;
@@ -254,6 +255,7 @@ struct mem_cgroup {
 	struct list_head objcg_list;
 
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
+	struct memcg1_events_percpu __percpu *events_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head cgwb_list;
-- 
cgit v1.2.3


From 0ccaf421d6592bb99bb9b424e4ccca3c6367d799 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 14 Aug 2024 22:04:52 -0700
Subject: memcg: allocate v1 event percpu only on v1 deployment

Currently memcg->events_percpu gets allocated on v2 deployments.  Let's
move the allocation to v1 only codebase.  This is not needed in v2.

Link: https://lkml.kernel.org/r/20240815050453.1298138-7-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 08b7121cf43a..ed170399179a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -255,7 +255,6 @@ struct mem_cgroup {
 	struct list_head objcg_list;
 
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
-	struct memcg1_events_percpu __percpu *events_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head cgwb_list;
@@ -277,6 +276,8 @@ struct mem_cgroup {
 	struct page_counter kmem;		/* v1 only */
 	struct page_counter tcpmem;		/* v1 only */
 
+	struct memcg1_events_percpu __percpu *events_percpu;
+
 	unsigned long soft_limit;
 
 	/* protected by memcg_oom_lock */
-- 
cgit v1.2.3


From 836d13a6ef8a2eb0eab2bd2de06f2deabc62b060 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sun, 21 Jul 2024 16:36:18 +0300
Subject: xz: switch from public domain to BSD Zero Clause License (0BSD)

Remove the public domain notices and add SPDX license identifiers.

Change MODULE_LICENSE from "GPL" to "Dual BSD/GPL" because 0BSD should
count as a BSD license variant here.

The switch to 0BSD was done in the upstream XZ Embedded project because
public domain has (real or perceived) legal issues in some jurisdictions.

Link: https://lkml.kernel.org/r/20240721133633.47721-4-lasse.collin@tukaani.org
Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
Reviewed-by: Sam James <sam@gentoo.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jubin Zhong <zhongjubin@huawei.com>
Cc: Jules Maselbas <jmaselbas@zdiv.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Rui Li <me@lirui.org>
Cc: Simon Glass <sjg@chromium.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/decompress/unxz.h | 5 ++---
 include/linux/xz.h              | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h
index f764e2a7201e..3dd2658a9dab 100644
--- a/include/linux/decompress/unxz.h
+++ b/include/linux/decompress/unxz.h
@@ -1,10 +1,9 @@
+/* SPDX-License-Identifier: 0BSD */
+
 /*
  * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd
  *
  * Author: Lasse Collin <lasse.collin@tukaani.org>
- *
- * This file has been put into the public domain.
- * You can do whatever you want with this file.
  */
 
 #ifndef DECOMPRESS_UNXZ_H
diff --git a/include/linux/xz.h b/include/linux/xz.h
index 7285ca5d56e9..5728d57aecc0 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -1,11 +1,10 @@
+/* SPDX-License-Identifier: 0BSD */
+
 /*
  * XZ decompressor
  *
  * Authors: Lasse Collin <lasse.collin@tukaani.org>
  *          Igor Pavlov <https://7-zip.org/>
- *
- * This file has been put into the public domain.
- * You can do whatever you want with this file.
  */
 
 #ifndef XZ_H
-- 
cgit v1.2.3


From ad8c67b870d108aa1286f4fc76d0c29a736fd75e Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sun, 21 Jul 2024 16:36:20 +0300
Subject: xz: fix kernel-doc formatting errors in xz.h

The opaque structs xz_dec and xz_dec_microlzma are declared in xz.h but
their definitions are in xz_dec_lzma2.c without kernel-doc comments.  Use
regular comments for these structs in xz.h to avoid errors when building
the docs.

Add a few missing colons.

Link: https://lkml.kernel.org/r/20240721133633.47721-6-lasse.collin@tukaani.org
Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
Reviewed-by: Sam James <sam@gentoo.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jubin Zhong <zhongjubin@huawei.com>
Cc: Jules Maselbas <jmaselbas@zdiv.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Rui Li <me@lirui.org>
Cc: Simon Glass <sjg@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xz.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/xz.h b/include/linux/xz.h
index 5728d57aecc0..af1e075d9add 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -142,7 +142,7 @@ struct xz_buf {
 	size_t out_size;
 };
 
-/**
+/*
  * struct xz_dec - Opaque type to hold the XZ decoder state
  */
 struct xz_dec;
@@ -240,15 +240,16 @@ XZ_EXTERN void xz_dec_end(struct xz_dec *s);
  * marked with XZ_EXTERN. This avoids warnings about static functions that
  * are never defined.
  */
-/**
+
+/*
  * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state
  */
 struct xz_dec_microlzma;
 
 /**
  * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder
- * @mode        XZ_SINGLE or XZ_PREALLOC
- * @dict_size   LZMA dictionary size. This must be at least 4 KiB and
+ * @mode:       XZ_SINGLE or XZ_PREALLOC
+ * @dict_size:  LZMA dictionary size. This must be at least 4 KiB and
  *              at most 3 GiB.
  *
  * In contrast to xz_dec_init(), this function only allocates the memory
@@ -276,15 +277,15 @@ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
 
 /**
  * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
- * @s           Decoder state allocated using xz_dec_microlzma_alloc()
- * @comp_size   Compressed size of the input stream
- * @uncomp_size Uncompressed size of the input stream. A value smaller
+ * @s:          Decoder state allocated using xz_dec_microlzma_alloc()
+ * @comp_size:  Compressed size of the input stream
+ * @uncomp_size:  Uncompressed size of the input stream. A value smaller
  *              than the real uncompressed size of the input stream can
  *              be specified if uncomp_size_is_exact is set to false.
  *              uncomp_size can never be set to a value larger than the
  *              expected real uncompressed size because it would eventually
  *              result in XZ_DATA_ERROR.
- * @uncomp_size_is_exact  This is an int instead of bool to avoid
+ * @uncomp_size_is_exact:  This is an int instead of bool to avoid
  *              requiring stdbool.h. This should normally be set to true.
  *              When this is set to false, error detection is weaker.
  */
@@ -294,7 +295,7 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
 
 /**
  * xz_dec_microlzma_run() - Run the MicroLZMA decoder
- * @s           Decoder state initialized using xz_dec_microlzma_reset()
+ * @s:          Decoder state initialized using xz_dec_microlzma_reset()
  * @b:          Input and output buffers
  *
  * This works similarly to xz_dec_run() with a few important differences.
-- 
cgit v1.2.3


From 0f2c5996340b69d167d3f6ca38d3012204787ac1 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sun, 21 Jul 2024 16:36:21 +0300
Subject: xz: improve the MicroLZMA kernel-doc in xz.h

Move the description of the format into a "DOC:" comment.  Emphasize that
MicroLZMA functions aren't usually needed.

Link: https://lkml.kernel.org/r/20240721133633.47721-7-lasse.collin@tukaani.org
Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
Reviewed-by: Sam James <sam@gentoo.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jubin Zhong <zhongjubin@huawei.com>
Cc: Jules Maselbas <jmaselbas@zdiv.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Rui Li <me@lirui.org>
Cc: Simon Glass <sjg@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xz.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/xz.h b/include/linux/xz.h
index af1e075d9add..701d62c02b9a 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -232,9 +232,18 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
  */
 XZ_EXTERN void xz_dec_end(struct xz_dec *s);
 
-/*
- * Decompressor for MicroLZMA, an LZMA variant with a very minimal header.
- * See xz_dec_microlzma_alloc() below for details.
+/**
+ * DOC: MicroLZMA decompressor
+ *
+ * This MicroLZMA header format was created for use in EROFS but may be used
+ * by others too. **In most cases one needs the XZ APIs above instead.**
+ *
+ * The compressed format supported by this decoder is a raw LZMA stream
+ * whose first byte (always 0x00) has been replaced with bitwise-negation
+ * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
+ * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
+ * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
+ * marker must not be used. The unused values are reserved for future use.
  *
  * These functions aren't used or available in preboot code and thus aren't
  * marked with XZ_EXTERN. This avoids warnings about static functions that
@@ -262,15 +271,6 @@ struct xz_dec_microlzma;
  * On success, xz_dec_microlzma_alloc() returns a pointer to
  * struct xz_dec_microlzma. If memory allocation fails or
  * dict_size is invalid, NULL is returned.
- *
- * The compressed format supported by this decoder is a raw LZMA stream
- * whose first byte (always 0x00) has been replaced with bitwise-negation
- * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is
- * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
- * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
- * marker must not be used. The unused values are reserved for future use.
- * This MicroLZMA header format was created for use in EROFS but may be used
- * by others too.
  */
 extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
 						       uint32_t dict_size);
-- 
cgit v1.2.3


From c6f371bab25edccd39caa5dd452b50d9dfdf4ff0 Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Wed, 24 Jul 2024 14:05:41 +0300
Subject: xz: remove XZ_EXTERN and extern from functions

XZ_EXTERN was used to make internal functions static in the preboot code.
However, in other decompressors this hasn't been done.  On x86-64, this
makes no difference to the kernel image size.

Omit XZ_EXTERN and let some of the internal functions be extern in the
preboot code.  Omitting XZ_EXTERN from include/linux/xz.h fixes warnings
in "make htmldocs" and makes the intradocument links to xz_dec functions
work in Documentation/staging/xz.rst.  The alternative would have been to
add "XZ_EXTERN" to c_id_attributes in Documentation/conf.py but omitting
XZ_EXTERN seemed cleaner.

Link: https://lore.kernel.org/lkml/20240723205437.3c0664b0@kaneli/
Link: https://lkml.kernel.org/r/20240724110544.16430-1-lasse.collin@tukaani.org
Signed-off-by: Lasse Collin <lasse.collin@tukaani.org>
Tested-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Sam James <sam@gentoo.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Jubin Zhong <zhongjubin@huawei.com>
Cc: Jules Maselbas <jmaselbas@zdiv.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Rui Li <me@lirui.org>
Cc: Simon Glass <sjg@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xz.h | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/xz.h b/include/linux/xz.h
index 701d62c02b9a..58ae1d746c6f 100644
--- a/include/linux/xz.h
+++ b/include/linux/xz.h
@@ -18,11 +18,6 @@
 #	include <stdint.h>
 #endif
 
-/* In Linux, this is used to make extern functions static when needed. */
-#ifndef XZ_EXTERN
-#	define XZ_EXTERN extern
-#endif
-
 /**
  * enum xz_mode - Operation mode
  *
@@ -190,7 +185,7 @@ struct xz_dec;
  * ready to be used with xz_dec_run(). If memory allocation fails,
  * xz_dec_init() returns NULL.
  */
-XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
+struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
 
 /**
  * xz_dec_run() - Run the XZ decoder
@@ -210,7 +205,7 @@ XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max);
  * get that amount valid data from the beginning of the stream. You must use
  * the multi-call decoder if you don't want to uncompress the whole stream.
  */
-XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
+enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
 
 /**
  * xz_dec_reset() - Reset an already allocated decoder state
@@ -223,14 +218,14 @@ XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b);
  * xz_dec_run(). Thus, explicit call to xz_dec_reset() is useful only in
  * multi-call mode.
  */
-XZ_EXTERN void xz_dec_reset(struct xz_dec *s);
+void xz_dec_reset(struct xz_dec *s);
 
 /**
  * xz_dec_end() - Free the memory allocated for the decoder state
  * @s:          Decoder state allocated using xz_dec_init(). If s is NULL,
  *              this function does nothing.
  */
-XZ_EXTERN void xz_dec_end(struct xz_dec *s);
+void xz_dec_end(struct xz_dec *s);
 
 /**
  * DOC: MicroLZMA decompressor
@@ -244,10 +239,6 @@ XZ_EXTERN void xz_dec_end(struct xz_dec *s);
  * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00.
  * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream
  * marker must not be used. The unused values are reserved for future use.
- *
- * These functions aren't used or available in preboot code and thus aren't
- * marked with XZ_EXTERN. This avoids warnings about static functions that
- * are never defined.
  */
 
 /*
@@ -272,8 +263,8 @@ struct xz_dec_microlzma;
  * struct xz_dec_microlzma. If memory allocation fails or
  * dict_size is invalid, NULL is returned.
  */
-extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
-						       uint32_t dict_size);
+struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
+						uint32_t dict_size);
 
 /**
  * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state
@@ -289,9 +280,8 @@ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode,
  *              requiring stdbool.h. This should normally be set to true.
  *              When this is set to false, error detection is weaker.
  */
-extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
-				   uint32_t comp_size, uint32_t uncomp_size,
-				   int uncomp_size_is_exact);
+void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size,
+			    uint32_t uncomp_size, int uncomp_size_is_exact);
 
 /**
  * xz_dec_microlzma_run() - Run the MicroLZMA decoder
@@ -329,15 +319,14 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s,
  * may be changed normally like with XZ_PREALLOC. This way input data can be
  * provided from non-contiguous memory.
  */
-extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s,
-					struct xz_buf *b);
+enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, struct xz_buf *b);
 
 /**
  * xz_dec_microlzma_end() - Free the memory allocated for the decoder state
  * @s:          Decoder state allocated using xz_dec_microlzma_alloc().
  *              If s is NULL, this function does nothing.
  */
-extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
+void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
 
 /*
  * Standalone build (userspace build or in-kernel build for boot time use)
@@ -358,13 +347,13 @@ extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s);
  * This must be called before any other xz_* function to initialize
  * the CRC32 lookup table.
  */
-XZ_EXTERN void xz_crc32_init(void);
+void xz_crc32_init(void);
 
 /*
  * Update CRC32 value using the polynomial from IEEE-802.3. To start a new
  * calculation, the third argument must be zero. To continue the calculation,
  * the previously returned value is passed as the third argument.
  */
-XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc);
+uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc);
 #endif
 #endif
-- 
cgit v1.2.3


From 9a42bfd255b288dad2d1a9df0a1fe58394d5da12 Mon Sep 17 00:00:00 2001
From: Deshan Zhang <deshan@nfschina.com>
Date: Thu, 25 Jul 2024 17:30:45 +0800
Subject: lib/lru_cache: fix spelling mistake "colision"->"collision"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a spelling mistake in a literal string and in cariable names.
Fix these.

Link: https://lkml.kernel.org/r/20240725093044.1742842-1-deshan@nfschina.com
Signed-off-by: Deshan Zhang <deshan@nfschina.com>
Cc: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Cc: Lars Ellenberg <lars.ellenberg@linbit.com>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/lru_cache.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index c9afcdd9324c..ff82ef85a084 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -119,7 +119,7 @@ write intent log information, three of which are mentioned here.
 */
 
 /* this defines an element in a tracked set
- * .colision is for hash table lookup.
+ * .collision is for hash table lookup.
  * When we process a new IO request, we know its sector, thus can deduce the
  * region number (label) easily.  To do the label -> object lookup without a
  * full list walk, we use a simple hash table.
@@ -145,7 +145,7 @@ write intent log information, three of which are mentioned here.
  * But it avoids high order page allocations in kmalloc.
  */
 struct lc_element {
-	struct hlist_node colision;
+	struct hlist_node collision;
 	struct list_head list;		 /* LRU list or free list */
 	unsigned refcnt;
 	/* back "pointer" into lc_cache->element[index],
-- 
cgit v1.2.3


From 6ce2082fd3a25d5a8c756120959237cace0379f1 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 13 Aug 2024 15:12:35 +0300
Subject: fault-inject: improve build for CONFIG_FAULT_INJECTION=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fault-inject.h users across the kernel need to add a lot of #ifdef
CONFIG_FAULT_INJECTION to cater for shortcomings in the header.  Make
fault-inject.h self-contained for CONFIG_FAULT_INJECTION=n, and add stubs
for DECLARE_FAULT_ATTR(), setup_fault_attr(), should_fail_ex(), and
should_fail() to allow removal of conditional compilation.

[akpm@linux-foundation.org: repair fallout from no longer including debugfs.h into fault-inject.h]
[akpm@linux-foundation.org: fix drivers/misc/xilinx_tmr_inject.c]
[akpm@linux-foundation.org: Add debugfs.h inclusion to more files, per Stephen]
Link: https://lkml.kernel.org/r/20240813121237.2382534-1-jani.nikula@intel.com
Fixes: 6ff1cb355e62 ("[PATCH] fault-injection capabilities infrastructure")
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fault-inject.h | 36 +++++++++++++++++++++++++++++-------
 include/linux/mmc/host.h     |  1 +
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 354413950d34..8c829d28dcf3 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -2,13 +2,17 @@
 #ifndef _LINUX_FAULT_INJECT_H
 #define _LINUX_FAULT_INJECT_H
 
+#include <linux/err.h>
+#include <linux/types.h>
+
+struct dentry;
+struct kmem_cache;
+
 #ifdef CONFIG_FAULT_INJECTION
 
-#include <linux/types.h>
-#include <linux/debugfs.h>
+#include <linux/atomic.h>
 #include <linux/configfs.h>
 #include <linux/ratelimit.h>
-#include <linux/atomic.h>
 
 /*
  * For explanation of the elements of this struct, see
@@ -51,6 +55,28 @@ int setup_fault_attr(struct fault_attr *attr, char *str);
 bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags);
 bool should_fail(struct fault_attr *attr, ssize_t size);
 
+#else /* CONFIG_FAULT_INJECTION */
+
+struct fault_attr {
+};
+
+#define DECLARE_FAULT_ATTR(name) struct fault_attr name = {}
+
+static inline int setup_fault_attr(struct fault_attr *attr, char *str)
+{
+	return 0; /* Note: 0 means error for __setup() handlers! */
+}
+static inline bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags)
+{
+	return false;
+}
+static inline bool should_fail(struct fault_attr *attr, ssize_t size)
+{
+	return false;
+}
+
+#endif /* CONFIG_FAULT_INJECTION */
+
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 
 struct dentry *fault_create_debugfs_attr(const char *name,
@@ -87,10 +113,6 @@ static inline void fault_config_init(struct fault_config *config,
 
 #endif /* CONFIG_FAULT_INJECTION_CONFIGFS */
 
-#endif /* CONFIG_FAULT_INJECTION */
-
-struct kmem_cache;
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order);
 #else
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 88c6a76042ee..49470188fca7 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 #include <linux/device.h>
 #include <linux/fault-inject.h>
+#include <linux/debugfs.h>
 
 #include <linux/mmc/core.h>
 #include <linux/mmc/card.h>
-- 
cgit v1.2.3


From d994c238347d7ba4de15da00985e1bea75e91dc7 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 17 Aug 2024 14:37:54 +0200
Subject: ratelimit: convert flags to int to save 8 bytes in size

Only bit 1 is used, making an unsigned long a total overkill.

This brings it from 40 to 32 bytes, which in turn shrinks user_struct from
136 to 128 bytes.  Since the latter is allocated with hwalign, this means
the total usage goes down from 192 to 128 bytes per object.

No functional changes.

Link: https://lkml.kernel.org/r/20240817123754.240924-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ratelimit_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h
index 002266693e50..765232ce0b5e 100644
--- a/include/linux/ratelimit_types.h
+++ b/include/linux/ratelimit_types.h
@@ -19,8 +19,8 @@ struct ratelimit_state {
 	int		burst;
 	int		printed;
 	int		missed;
+	unsigned int	flags;
 	unsigned long	begin;
-	unsigned long	flags;
 };
 
 #define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \
-- 
cgit v1.2.3


From 32cebfe1cc21a84e4a907575bb9fd1c3f6b091fd Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Tue, 27 Aug 2024 10:45:15 +0800
Subject: lib/string_choices: add str_true_false()/str_false_true() helper

Add str_true_false()/str_false_true() helper to retur a "true" or "false"
string literal.  We found more than 10 cases currently exist in the tree.
So these helpers can be used for these cases.


This patch (of 3):

Add str_true_false()/str_false_true() helper to return "true" or "false"
string literal.

Link: https://lkml.kernel.org/r/20240827024517.914100-1-lihongbo22@huawei.com
Link: https://lkml.kernel.org/r/20240827024517.914100-2-lihongbo22@huawei.com
Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Cc: Andy Shevchenko <andy@kernel.org>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Trond Myklebust <trondmy@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/string_choices.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index d9ebe20229f8..4a2432313b8e 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -42,6 +42,12 @@ static inline const char *str_yes_no(bool v)
 	return v ? "yes" : "no";
 }
 
+static inline const char *str_true_false(bool v)
+{
+	return v ? "true" : "false";
+}
+#define str_false_true(v)		str_true_false(!(v))
+
 /**
  * str_plural - Return the simple pluralization based on English counts
  * @num: Number used for deciding pluralization
-- 
cgit v1.2.3


From c4a315eaf8eff0d3234600e13db7e7c71c0b3405 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 21 Aug 2024 14:14:56 +0200
Subject: gpio: ath79: remove support for platform data

There are no more board files defining platform data for this driver so
remove the header and support from the driver.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20240821121456.19553-4-brgl@bgdev.pl
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/platform_data/gpio-ath79.h | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 include/linux/platform_data/gpio-ath79.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gpio-ath79.h b/include/linux/platform_data/gpio-ath79.h
deleted file mode 100644
index 3ea6dd942c27..000000000000
--- a/include/linux/platform_data/gpio-ath79.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *  Atheros AR7XXX/AR9XXX GPIO controller platform data
- *
- * Copyright (C) 2015 Alban Bedel <albeu@free.fr>
- */
-
-#ifndef __LINUX_PLATFORM_DATA_GPIO_ATH79_H
-#define __LINUX_PLATFORM_DATA_GPIO_ATH79_H
-
-struct ath79_gpio_platform_data {
-	unsigned ngpios;
-	bool oe_inverted;
-};
-
-#endif
-- 
cgit v1.2.3


From 4b2b0a2ce8153d65d0829e45e73bf6acdc291344 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 28 Aug 2024 17:23:57 +0300
Subject: gpiolib: legacy: Kill GPIOF_INIT_* definitions

Besides the fact that (old) drivers use wrong definitions, e.g.,
GPIOF_INIT_HIGH instead of GPIOF_OUT_INIT_HIGH, shrink the legacy
definitions by killing those GPIOF_INIT_* completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Link: https://lore.kernel.org/r/20240828142554.2424189-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 063f71b18a7c..4af8ad114557 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -20,12 +20,9 @@ struct device;
 #define GPIOF_DIR_OUT	(0 << 0)
 #define GPIOF_DIR_IN	(1 << 0)
 
-#define GPIOF_INIT_LOW	(0 << 1)
-#define GPIOF_INIT_HIGH	(1 << 1)
-
 #define GPIOF_IN		(GPIOF_DIR_IN)
-#define GPIOF_OUT_INIT_LOW	(GPIOF_DIR_OUT | GPIOF_INIT_LOW)
-#define GPIOF_OUT_INIT_HIGH	(GPIOF_DIR_OUT | GPIOF_INIT_HIGH)
+#define GPIOF_OUT_INIT_LOW	(GPIOF_DIR_OUT | (0 << 1))
+#define GPIOF_OUT_INIT_HIGH	(GPIOF_DIR_OUT | (1 << 1))
 
 /* Gpio pin is active-low */
 #define GPIOF_ACTIVE_LOW        (1 << 2)
-- 
cgit v1.2.3


From 8c045ca534d03bab1ce4b4de49d29a36276a1e35 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 28 Aug 2024 17:23:58 +0300
Subject: gpiolib: legacy: Kill GPIOF_DIR_* definitions

Besides the fact that (old) drivers use wrong definitions, e.g.,
GPIOF_DIR_IN instead of GPIOF_IN, shrink the legacy definitions
by killing those GPIOF_DIR_* completely.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Link: https://lore.kernel.org/r/20240828142554.2424189-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 4af8ad114557..2d105be7bbc3 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -17,12 +17,9 @@
 struct device;
 
 /* make these flag values available regardless of GPIO kconfig options */
-#define GPIOF_DIR_OUT	(0 << 0)
-#define GPIOF_DIR_IN	(1 << 0)
-
-#define GPIOF_IN		(GPIOF_DIR_IN)
-#define GPIOF_OUT_INIT_LOW	(GPIOF_DIR_OUT | (0 << 1))
-#define GPIOF_OUT_INIT_HIGH	(GPIOF_DIR_OUT | (1 << 1))
+#define GPIOF_IN		((1 << 0))
+#define GPIOF_OUT_INIT_LOW	((0 << 0) | (0 << 1))
+#define GPIOF_OUT_INIT_HIGH	((0 << 0) | (1 << 1))
 
 /* Gpio pin is active-low */
 #define GPIOF_ACTIVE_LOW        (1 << 2)
-- 
cgit v1.2.3


From e220917fa50774fedb27c075df2261fd664e8ca3 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 22 Aug 2024 15:50:12 +0200
Subject: mm: split a folio in minimum folio order chunks

split_folio() and split_folio_to_list() assume order 0, to support
minorder for non-anonymous folios, we must expand these to check the
folio mapping order and use that.

Set new_order to be at least minimum folio order if it is set in
split_huge_page_to_list() so that we can maintain minimum folio order
requirement in the page cache.

Update the debugfs write files used for testing to ensure the order
is respected as well. We simply enforce the min order when a file
mapping is used.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20240902124931.506061-2-kernel@pankajraghav.com # folded fix
Link: https://lore.kernel.org/r/20240822135018.1931258-5-kernel@pankajraghav.com
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/huge_mm.h | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..7a570e0437c9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -96,6 +96,8 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
 	(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
 
+#define split_folio(f) split_folio_to_list(f, NULL)
+
 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
 #define HPAGE_PMD_SHIFT PMD_SHIFT
 #define HPAGE_PUD_SHIFT PUD_SHIFT
@@ -317,9 +319,24 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
 bool can_split_folio(struct folio *folio, int *pextra_pins);
 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
+int min_order_for_split(struct folio *folio);
+int split_folio_to_list(struct folio *folio, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
-	return split_huge_page_to_list_to_order(page, NULL, 0);
+	struct folio *folio = page_folio(page);
+	int ret = min_order_for_split(folio);
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * split_huge_page() locks the page before splitting and
+	 * expects the same page that has been split to be locked when
+	 * returned. split_folio(page_folio(page)) cannot be used here
+	 * because it converts the page to folio and passes the head
+	 * page to be split.
+	 */
+	return split_huge_page_to_list_to_order(page, NULL, ret);
 }
 void deferred_split_folio(struct folio *folio);
 
@@ -484,6 +501,12 @@ static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
+
+static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
+{
+	return 0;
+}
+
 static inline void deferred_split_folio(struct folio *folio) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
@@ -598,7 +621,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
 	return split_folio_to_list_to_order(folio, NULL, new_order);
 }
 
-#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
-#define split_folio(f) split_folio_to_order(f, 0)
-
 #endif /* _LINUX_HUGE_MM_H */
-- 
cgit v1.2.3


From f46a6e16519712651e2304da03ec144cc0bb084c Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Fri, 30 Aug 2024 18:26:28 +0300
Subject: usb: Add tunnel_mode parameter to usb device structure

Add 'tunnel_mode' enum to usb device structure to describe if a USB3
link is tunneled over USB4, or connected directly using native USB2/USB3
protocols.

Tunneled devices depend on USB4 NHI host to maintain the tunnel.
Knowledge about tunneled devices is important to ensure correct
suspend and resume order between USB4 hosts and tunneled devices.
i.e. make sure tunnel is up before the USB device using it resumes.

USB hosts such as xHCI may have vendor specific ways to detect tunneled
connections. This 'tunnel_mode' parameter can be set by USB3 host driver
during hcd->driver->update_device(hcd, udev) callback.

tunnel_mode can be set to:
USB_LINK_UNKNOWN = 0
USB_LINK_NATIVE
USB_LINK_TUNNELED

USB_LINK_UNKNOWN is used in case host is not capable of detecting
tunneled links.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Link: https://lore.kernel.org/r/20240830152630.3943215-3-mathias.nyman@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 832997a9da0a..672d8fc2abdb 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -495,6 +495,12 @@ struct usb_dev_state;
 
 struct usb_tt;
 
+enum usb_link_tunnel_mode {
+	USB_LINK_UNKNOWN = 0,
+	USB_LINK_NATIVE,
+	USB_LINK_TUNNELED,
+};
+
 enum usb_port_connect_type {
 	USB_PORT_CONNECT_TYPE_UNKNOWN = 0,
 	USB_PORT_CONNECT_TYPE_HOT_PLUG,
@@ -605,6 +611,7 @@ struct usb3_lpm_parameters {
  *	WUSB devices are not, until we authorize them from user space.
  *	FIXME -- complete doc
  * @authenticated: Crypto authentication passed
+ * @tunnel_mode: Connection native or tunneled over USB4
  * @lpm_capable: device supports LPM
  * @lpm_devinit_allow: Allow USB3 device initiated LPM, exit latency is in range
  * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM
@@ -714,6 +721,7 @@ struct usb_device {
 	unsigned do_remote_wakeup:1;
 	unsigned reset_resume:1;
 	unsigned port_is_suspended:1;
+	enum usb_link_tunnel_mode tunnel_mode;
 
 	int slot_id;
 	struct usb2_lpm_parameters l1_params;
-- 
cgit v1.2.3


From d9c61bb33fbbcbc2d5f69efa10db116dab4b56cc Mon Sep 17 00:00:00 2001
From: Michael Grzeschik <m.grzeschik@pengutronix.de>
Date: Sun, 1 Sep 2024 21:11:16 +0200
Subject: usb: gadget: function: move u_f.h to include/linux/usb/func_utils.h

We move the func_utils.h header to include/linux/usb to be
able to compile function drivers outside of the
drivers/usb/gadget/function directory.

Signed-off-by: Michael Grzeschik <m.grzeschik@pengutronix.de>
Link: https://lore.kernel.org/r/20240116-ml-topic-u9p-v12-1-9a27de5160e0@pengutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/func_utils.h | 86 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 include/linux/usb/func_utils.h

(limited to 'include/linux')

diff --git a/include/linux/usb/func_utils.h b/include/linux/usb/func_utils.h
new file mode 100644
index 000000000000..c8795c965109
--- /dev/null
+++ b/include/linux/usb/func_utils.h
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * func_utils.h
+ *
+ * Utility definitions for USB functions
+ *
+ * Copyright (c) 2013 Samsung Electronics Co., Ltd.
+ *		http://www.samsung.com
+ *
+ * Author: Andrzej Pietrasiewicz <andrzejtp2010@gmail.com>
+ */
+
+#ifndef _FUNC_UTILS_H_
+#define _FUNC_UTILS_H_
+
+#include <linux/usb/gadget.h>
+#include <linux/overflow.h>
+
+/* Variable Length Array Macros **********************************************/
+#define vla_group(groupname) size_t groupname##__next = 0
+#define vla_group_size(groupname) groupname##__next
+
+#define vla_item(groupname, type, name, n) \
+	size_t groupname##_##name##__offset = ({			       \
+		size_t offset = 0;					       \
+		if (groupname##__next != SIZE_MAX) {			       \
+			size_t align_mask = __alignof__(type) - 1;	       \
+			size_t size = array_size(n, sizeof(type));	       \
+			offset = (groupname##__next + align_mask) &	       \
+				  ~align_mask;				       \
+			if (check_add_overflow(offset, size,		       \
+					       &groupname##__next)) {          \
+				groupname##__next = SIZE_MAX;		       \
+				offset = 0;				       \
+			}						       \
+		}							       \
+		offset;							       \
+	})
+
+#define vla_item_with_sz(groupname, type, name, n) \
+	size_t groupname##_##name##__sz = array_size(n, sizeof(type));	        \
+	size_t groupname##_##name##__offset = ({			        \
+		size_t offset = 0;						\
+		if (groupname##__next != SIZE_MAX) {				\
+			size_t align_mask = __alignof__(type) - 1;		\
+			offset = (groupname##__next + align_mask) &		\
+				  ~align_mask;					\
+			if (check_add_overflow(offset, groupname##_##name##__sz,\
+							&groupname##__next)) {	\
+				groupname##__next = SIZE_MAX;			\
+				offset = 0;					\
+			}							\
+		}								\
+		offset;								\
+	})
+
+#define vla_ptr(ptr, groupname, name) \
+	((void *) ((char *)ptr + groupname##_##name##__offset))
+
+struct usb_ep;
+struct usb_request;
+
+/**
+ * alloc_ep_req - returns a usb_request allocated by the gadget driver and
+ * allocates the request's buffer.
+ *
+ * @ep: the endpoint to allocate a usb_request
+ * @len: usb_requests's buffer suggested size
+ *
+ * In case @ep direction is OUT, the @len will be aligned to ep's
+ * wMaxPacketSize. In order to avoid memory leaks or drops, *always* use
+ * usb_requests's length (req->length) to refer to the allocated buffer size.
+ * Requests allocated via alloc_ep_req() *must* be freed by free_ep_req().
+ */
+struct usb_request *alloc_ep_req(struct usb_ep *ep, size_t len);
+
+/* Frees a usb_request previously allocated by alloc_ep_req() */
+static inline void free_ep_req(struct usb_ep *ep, struct usb_request *req)
+{
+	WARN_ON(req->buf == NULL);
+	kfree(req->buf);
+	req->buf = NULL;
+	usb_ep_free_request(ep, req);
+}
+
+#endif /* _FUNC_UTILS_H_ */
-- 
cgit v1.2.3


From beb5a9bea8239cdf4adf6b62672e30db3e9fa5ce Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 29 Aug 2024 14:33:36 +0200
Subject: netdevice: convert private flags > BIT(31) to bitfields

Make dev->priv_flags `u32` back and define bits higher than 31 as
bitfield booleans as per Jakub's suggestion. This simplifies code
which accesses these bits with no optimization loss (testb both
before/after), allows to not extend &netdev_priv_flags each time,
but also scales better as bits > 63 in the future would only add
a new u64 to the structure with no complications, comparing to
that extending ::priv_flags would require converting it to a bitmap.
Note that I picked `unsigned long :1` to not lose any potential
optimizations comparing to `bool :1` etc.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fce70990b209..d6f35c9d8580 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1613,7 +1613,8 @@ struct net_device_ops {
  * userspace; this means that the order of these flags can change
  * during any kernel release.
  *
- * You should have a pretty good reason to be extending these flags.
+ * You should add bitfield booleans after either net_device::priv_flags
+ * (hotpath) or ::threaded (slowpath) instead of extending these flags.
  *
  * @IFF_802_1Q_VLAN: 802.1Q VLAN device
  * @IFF_EBRIDGE: Ethernet bridging device
@@ -1652,10 +1653,6 @@ struct net_device_ops {
  * @IFF_NO_ADDRCONF: prevent ipv6 addrconf
  * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
  *	skb_headlen(skb) == 0 (data starts from frag0)
- * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN
- * @IFF_SEE_ALL_HWTSTAMP_REQUESTS: device wants to see calls to
- *	ndo_hwtstamp_set() for all timestamp requests regardless of source,
- *	even if those aren't HWTSTAMP_SOURCE_NETDEV.
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1690,8 +1687,6 @@ enum netdev_priv_flags {
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 	IFF_NO_ADDRCONF			= BIT_ULL(30),
 	IFF_TX_SKB_NO_LINEAR		= BIT_ULL(31),
-	IFF_CHANGE_PROTO_DOWN		= BIT_ULL(32),
-	IFF_SEE_ALL_HWTSTAMP_REQUESTS	= BIT_ULL(33),
 };
 
 /* Specifies the type of the struct net_device::ml_priv pointer */
@@ -1723,6 +1718,9 @@ enum netdev_reg_state {
  *	data with strictly "high-level" data, and it has to know about
  *	almost every data structure used in the INET module.
  *
+ *	@priv_flags:	flags invisible to userspace defined as bits, see
+ *			enum netdev_priv_flags for the definitions
+ *
  *	@name:	This is the first field of the "visible" part of this structure
  *		(i.e. as seen by users in the "Space.c" file).  It is the name
  *		of the interface.
@@ -1789,8 +1787,6 @@ enum netdev_reg_state {
  *
  *	@flags:		Interface flags (a la BSD)
  *	@xdp_features:	XDP capability supported by the device
- *	@priv_flags:	Like 'flags' but invisible to userspace,
- *			see if.h for the definitions
  *	@gflags:	Global flags ( kept as legacy )
  *	@priv_len:	Size of the ->priv flexible array
  *	@priv:		Flexible array containing private data
@@ -1964,6 +1960,12 @@ enum netdev_reg_state {
  *
  *	@threaded:	napi threaded mode is enabled
  *
+ *	@see_all_hwtstamp_requests: device wants to see calls to
+ *			ndo_hwtstamp_set() for all timestamp requests
+ *			regardless of source, even if those aren't
+ *			HWTSTAMP_SOURCE_NETDEV
+ *	@change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
+ *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
  *				to another network namespace.
@@ -2014,7 +2016,9 @@ struct net_device {
 
 	/* TX read-mostly hotpath */
 	__cacheline_group_begin(net_device_read_tx);
-	unsigned long long	priv_flags;
+	struct_group(priv_flags_fast,
+		unsigned long		priv_flags:32;
+	);
 	const struct net_device_ops *netdev_ops;
 	const struct header_ops *header_ops;
 	struct netdev_queue	*_tx;
@@ -2350,6 +2354,10 @@ struct net_device {
 	bool			proto_down;
 	bool			threaded;
 
+	/* priv_flags_slow, ungrouped to save space */
+	unsigned long		see_all_hwtstamp_requests:1;
+	unsigned long		change_proto_down:1;
+
 	struct list_head	net_notifier_list;
 
 #if IS_ENABLED(CONFIG_MACSEC)
-- 
cgit v1.2.3


From 00d066a4d4edbe559ba6c35153da71d4b2b8a383 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 29 Aug 2024 14:33:37 +0200
Subject: netdev_features: convert NETIF_F_LLTX to dev->lltx

NETIF_F_LLTX can't be changed via Ethtool and is not a feature,
rather an attribute, very similar to IFF_NO_QUEUE (and hot).
Free one netdev_features_t bit and make it a "hot" private flag.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdev_features.h |  6 ++----
 include/linux/netdevice.h       | 10 +++++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7c2d77d75a88..a2e20b517584 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -24,8 +24,7 @@ enum {
 	NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */
 	NETIF_F_VLAN_CHALLENGED_BIT,	/* Device cannot handle VLAN packets */
 	NETIF_F_GSO_BIT,		/* Enable software GSO. */
-	NETIF_F_LLTX_BIT,		/* LockLess TX - deprecated. Please */
-					/* do not use LLTX in new drivers */
+	__UNUSED_NETIF_F_12,
 	NETIF_F_NETNS_LOCAL_BIT,	/* Does not change network namespaces */
 	NETIF_F_GRO_BIT,		/* Generic receive offload */
 	NETIF_F_LRO_BIT,		/* large receive offload */
@@ -120,7 +119,6 @@ enum {
 #define NETIF_F_HW_VLAN_CTAG_TX	__NETIF_F(HW_VLAN_CTAG_TX)
 #define NETIF_F_IP_CSUM		__NETIF_F(IP_CSUM)
 #define NETIF_F_IPV6_CSUM	__NETIF_F(IPV6_CSUM)
-#define NETIF_F_LLTX		__NETIF_F(LLTX)
 #define NETIF_F_LOOPBACK	__NETIF_F(LOOPBACK)
 #define NETIF_F_LRO		__NETIF_F(LRO)
 #define NETIF_F_NETNS_LOCAL	__NETIF_F(NETNS_LOCAL)
@@ -193,7 +191,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
 #define NETIF_F_NEVER_CHANGE	(NETIF_F_VLAN_CHALLENGED | \
-				 NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
+				 NETIF_F_NETNS_LOCAL)
 
 /* remember that ((t)1 << t_BITS) is undefined in C99 */
 #define NETIF_F_ETHTOOL_BITS	((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d6f35c9d8580..e22ca5e07490 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1720,6 +1720,9 @@ enum netdev_reg_state {
  *
  *	@priv_flags:	flags invisible to userspace defined as bits, see
  *			enum netdev_priv_flags for the definitions
+ *	@lltx:		device supports lockless Tx. Deprecated for real HW
+ *			drivers. Mainly used by logical interfaces, such as
+ *			bonding and tunnels
  *
  *	@name:	This is the first field of the "visible" part of this structure
  *		(i.e. as seen by users in the "Space.c" file).  It is the name
@@ -2018,6 +2021,7 @@ struct net_device {
 	__cacheline_group_begin(net_device_read_tx);
 	struct_group(priv_flags_fast,
 		unsigned long		priv_flags:32;
+		unsigned long		lltx:1;
 	);
 	const struct net_device_ops *netdev_ops;
 	const struct header_ops *header_ops;
@@ -4433,7 +4437,7 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
 }
 
 #define HARD_TX_LOCK(dev, txq, cpu) {			\
-	if ((dev->features & NETIF_F_LLTX) == 0) {	\
+	if (!(dev)->lltx) {				\
 		__netif_tx_lock(txq, cpu);		\
 	} else {					\
 		__netif_tx_acquire(txq);		\
@@ -4441,12 +4445,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
 }
 
 #define HARD_TX_TRYLOCK(dev, txq)			\
-	(((dev->features & NETIF_F_LLTX) == 0) ?	\
+	(!(dev)->lltx ?					\
 		__netif_tx_trylock(txq) :		\
 		__netif_tx_acquire(txq))
 
 #define HARD_TX_UNLOCK(dev, txq) {			\
-	if ((dev->features & NETIF_F_LLTX) == 0) {	\
+	if (!(dev)->lltx) {				\
 		__netif_tx_unlock(txq);			\
 	} else {					\
 		__netif_tx_release(txq);		\
-- 
cgit v1.2.3


From 05c1280a2bcfca187fe7fa90bb240602cf54af0a Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 29 Aug 2024 14:33:38 +0200
Subject: netdev_features: convert NETIF_F_NETNS_LOCAL to dev->netns_local

"Interface can't change network namespaces" is rather an attribute,
not a feature, and it can't be changed via Ethtool.
Make it a "cold" private flag instead of a netdev_feature and free
one more bit.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdev_features.h | 6 ++----
 include/linux/netdevice.h       | 2 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a2e20b517584..d5a3836f4793 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -25,7 +25,7 @@ enum {
 	NETIF_F_VLAN_CHALLENGED_BIT,	/* Device cannot handle VLAN packets */
 	NETIF_F_GSO_BIT,		/* Enable software GSO. */
 	__UNUSED_NETIF_F_12,
-	NETIF_F_NETNS_LOCAL_BIT,	/* Does not change network namespaces */
+	__UNUSED_NETIF_F_13,
 	NETIF_F_GRO_BIT,		/* Generic receive offload */
 	NETIF_F_LRO_BIT,		/* large receive offload */
 
@@ -121,7 +121,6 @@ enum {
 #define NETIF_F_IPV6_CSUM	__NETIF_F(IPV6_CSUM)
 #define NETIF_F_LOOPBACK	__NETIF_F(LOOPBACK)
 #define NETIF_F_LRO		__NETIF_F(LRO)
-#define NETIF_F_NETNS_LOCAL	__NETIF_F(NETNS_LOCAL)
 #define NETIF_F_NOCACHE_COPY	__NETIF_F(NOCACHE_COPY)
 #define NETIF_F_NTUPLE		__NETIF_F(NTUPLE)
 #define NETIF_F_RXCSUM		__NETIF_F(RXCSUM)
@@ -190,8 +189,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
-#define NETIF_F_NEVER_CHANGE	(NETIF_F_VLAN_CHALLENGED | \
-				 NETIF_F_NETNS_LOCAL)
+#define NETIF_F_NEVER_CHANGE	NETIF_F_VLAN_CHALLENGED
 
 /* remember that ((t)1 << t_BITS) is undefined in C99 */
 #define NETIF_F_ETHTOOL_BITS	((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e22ca5e07490..a698e2402420 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1968,6 +1968,7 @@ enum netdev_reg_state {
  *			regardless of source, even if those aren't
  *			HWTSTAMP_SOURCE_NETDEV
  *	@change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
+ *	@netns_local: interface can't change network namespaces
  *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
@@ -2361,6 +2362,7 @@ struct net_device {
 	/* priv_flags_slow, ungrouped to save space */
 	unsigned long		see_all_hwtstamp_requests:1;
 	unsigned long		change_proto_down:1;
+	unsigned long		netns_local:1;
 
 	struct list_head	net_notifier_list;
 
-- 
cgit v1.2.3


From 782dbbf589cd9082effaec522e3f1b4ce1594803 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 29 Aug 2024 14:33:39 +0200
Subject: netdev_features: convert NETIF_F_FCOE_MTU to dev->fcoe_mtu

Ability to handle maximum FCoE frames of 2158 bytes can never be changed
and thus more of an attribute, not a toggleable feature.
Move it from netdev_features_t to "cold" priv flags (bitfield bool) and
free yet another feature bit.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdev_features.h | 6 ++----
 include/linux/netdevice.h       | 2 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index d5a3836f4793..37af2c6e7caf 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -58,7 +58,7 @@ enum {
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
-	NETIF_F_FCOE_MTU_BIT,		/* Supports max FCoE MTU, 2158 bytes*/
+	__UNUSED_NETIF_F_37,
 	NETIF_F_NTUPLE_BIT,		/* N-tuple filters supported */
 	NETIF_F_RXHASH_BIT,		/* Receive hashing offload */
 	NETIF_F_RXCSUM_BIT,		/* Receive checksumming offload */
@@ -105,7 +105,6 @@ enum {
 #define __NETIF_F(name)		__NETIF_F_BIT(NETIF_F_##name##_BIT)
 
 #define NETIF_F_FCOE_CRC	__NETIF_F(FCOE_CRC)
-#define NETIF_F_FCOE_MTU	__NETIF_F(FCOE_MTU)
 #define NETIF_F_FRAGLIST	__NETIF_F(FRAGLIST)
 #define NETIF_F_FSO		__NETIF_F(FSO)
 #define NETIF_F_GRO		__NETIF_F(GRO)
@@ -210,8 +209,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 #define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | \
 				 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)
 
-#define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \
-				 NETIF_F_FSO)
+#define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FSO)
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP |	     \
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a698e2402420..ca5f0dda733b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1969,6 +1969,7 @@ enum netdev_reg_state {
  *			HWTSTAMP_SOURCE_NETDEV
  *	@change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
  *	@netns_local: interface can't change network namespaces
+ *	@fcoe_mtu:	device supports maximum FCoE MTU, 2158 bytes
  *
  *	@net_notifier_list:	List of per-net netdev notifier block
  *				that follow this device when it is moved
@@ -2363,6 +2364,7 @@ struct net_device {
 	unsigned long		see_all_hwtstamp_requests:1;
 	unsigned long		change_proto_down:1;
 	unsigned long		netns_local:1;
+	unsigned long		fcoe_mtu:1;
 
 	struct list_head	net_notifier_list;
 
-- 
cgit v1.2.3


From a61fec1c87be682b5c0fdba6074649c469c675a3 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 29 Aug 2024 14:33:40 +0200
Subject: netdev_features: remove NETIF_F_ALL_FCOE

NETIF_F_ALL_FCOE is used only in vlan_dev.c, 2 times. Now that it's only
2 bits, open-code it and remove the definition from netdev_features.h.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdev_features.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 37af2c6e7caf..66e7d26b70a4 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -209,8 +209,6 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start)
 #define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | \
 				 NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID)
 
-#define NETIF_F_ALL_FCOE	(NETIF_F_FCOE_CRC | NETIF_F_FSO)
-
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP |	     \
 				 NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST)
-- 
cgit v1.2.3


From b45ed06f46737f8c2ee65698f4305409f2386674 Mon Sep 17 00:00:00 2001
From: Zijun Hu <quic_zijuhu@quicinc.com>
Date: Tue, 13 Aug 2024 22:19:32 +0800
Subject: drivers/base: Introduce device_match_t for device finding APIs

There are several drivers/base APIs for finding a specific device, and
they currently use the following good type for the @match parameter:
int (*match)(struct device *dev, const void *data)

Since these operations do not modify the caller-provided @*data, this
type is worthy of a dedicated typedef:
typedef int (*device_match_t)(struct device *dev, const void *data)

Advantages of using device_match_t:
 - Shorter API declarations and definitions
 - Prevent further APIs from using a bad type for @match

So introduce device_match_t and apply it to the existing
(bus|class|driver|auxiliary)_find_device() APIs.

Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
Link: https://lore.kernel.org/r/20240813-dev_match_api-v3-1-6c6878a99b9f@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/auxiliary_bus.h | 2 +-
 include/linux/device/bus.h    | 6 ++++--
 include/linux/device/class.h  | 2 +-
 include/linux/device/driver.h | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h
index 662b8ae54b6a..31762324bcc9 100644
--- a/include/linux/auxiliary_bus.h
+++ b/include/linux/auxiliary_bus.h
@@ -271,6 +271,6 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv);
 
 struct auxiliary_device *auxiliary_find_device(struct device *start,
 					       const void *data,
-					       int (*match)(struct device *dev, const void *data));
+					       device_match_t match);
 
 #endif /* _AUXILIARY_BUS_H_ */
diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h
index 807831d6bf0f..cdc4757217f9 100644
--- a/include/linux/device/bus.h
+++ b/include/linux/device/bus.h
@@ -126,6 +126,9 @@ struct bus_attribute {
 int __must_check bus_create_file(const struct bus_type *bus, struct bus_attribute *attr);
 void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr);
 
+/* Matching function type for drivers/base APIs to find a specific device */
+typedef int (*device_match_t)(struct device *dev, const void *data);
+
 /* Generic device matching functions that all busses can use to match with */
 int device_match_name(struct device *dev, const void *name);
 int device_match_of_node(struct device *dev, const void *np);
@@ -139,8 +142,7 @@ int device_match_any(struct device *dev, const void *unused);
 int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data,
 		     int (*fn)(struct device *dev, void *data));
 struct device *bus_find_device(const struct bus_type *bus, struct device *start,
-			       const void *data,
-			       int (*match)(struct device *dev, const void *data));
+			       const void *data, device_match_t match);
 /**
  * bus_find_device_by_name - device iterator for locating a particular device
  * of a specific name.
diff --git a/include/linux/device/class.h b/include/linux/device/class.h
index c576b49c55c2..518c9c83d64b 100644
--- a/include/linux/device/class.h
+++ b/include/linux/device/class.h
@@ -95,7 +95,7 @@ void class_dev_iter_exit(struct class_dev_iter *iter);
 int class_for_each_device(const struct class *class, const struct device *start, void *data,
 			  int (*fn)(struct device *dev, void *data));
 struct device *class_find_device(const struct class *class, const struct device *start,
-				 const void *data, int (*match)(struct device *, const void *));
+				 const void *data, device_match_t match);
 
 /**
  * class_find_device_by_name - device iterator for locating a particular device
diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 1fc8b68786de..5c04b8e3833b 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -157,7 +157,7 @@ int __must_check driver_for_each_device(struct device_driver *drv, struct device
 					void *data, int (*fn)(struct device *dev, void *));
 struct device *driver_find_device(const struct device_driver *drv,
 				  struct device *start, const void *data,
-				  int (*match)(struct device *dev, const void *data));
+				  device_match_t match);
 
 /**
  * driver_find_device_by_name - device iterator for locating a particular device
-- 
cgit v1.2.3


From 24e041e1e48d06f25a12caaf73728a4ec2e511fe Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Fri, 23 Aug 2024 15:55:44 +0800
Subject: platform: Make platform_bus_type constant

Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle constant
struct bus_type, move the platform_bus_type variable to be a constant
structure as well, placing it into read-only memory which can not be
modified at runtime.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Link: https://lore.kernel.org/r/20240823075544.144426-1-kunwu.chan@linux.dev
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index d422db6eec63..7132623e4658 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -52,7 +52,7 @@ struct platform_device {
 extern int platform_device_register(struct platform_device *);
 extern void platform_device_unregister(struct platform_device *);
 
-extern struct bus_type platform_bus_type;
+extern const struct bus_type platform_bus_type;
 extern struct device platform_bus;
 
 extern struct resource *platform_get_resource(struct platform_device *,
-- 
cgit v1.2.3


From 8064952c65045f05ee2671fe437770e50c151776 Mon Sep 17 00:00:00 2001
From: Stuart Hayes <stuart.w.hayes@gmail.com>
Date: Thu, 22 Aug 2024 15:28:04 -0500
Subject: driver core: shut down devices asynchronously

Add code to allow asynchronous shutdown of devices, ensuring that each
device is shut down before its parents & suppliers.

Only devices with drivers that have async_shutdown_enable enabled will be
shut down asynchronously.

This can dramatically reduce system shutdown/reboot time on systems that
have multiple devices that take many seconds to shut down (like certain
NVMe drives). On one system tested, the shutdown time went from 11 minutes
without this patch to 55 seconds with the patch.

Signed-off-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: David Jeffery <djeffery@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Tested-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20240822202805.6379-4-stuart.w.hayes@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 5c04b8e3833b..14c9211b82d6 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -56,6 +56,7 @@ enum probe_type {
  * @mod_name:	Used for built-in modules.
  * @suppress_bind_attrs: Disables bind/unbind via sysfs.
  * @probe_type:	Type of the probe (synchronous or asynchronous) to use.
+ * @async_shutdown_enable: Enables devices to be shutdown asynchronously.
  * @of_match_table: The open firmware table.
  * @acpi_match_table: The ACPI match table.
  * @probe:	Called to query the existence of a specific device,
@@ -102,6 +103,7 @@ struct device_driver {
 
 	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
 	enum probe_type probe_type;
+	bool async_shutdown_enable;
 
 	const struct of_device_id	*of_match_table;
 	const struct acpi_device_id	*acpi_match_table;
-- 
cgit v1.2.3


From 8ab0f4605d5ce3c0d386b3828b07719f1e8e0505 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Fri, 23 Aug 2024 14:24:40 +0800
Subject: bus: fsl-mc: make fsl_mc_bus_type const

Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle constant
struct bus_type, move the fsl_mc_bus_type variable to be a constant
structure as well, placing it into read-only memory which can not be
modified at runtime.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Acked-by: Christophe Leroy <christophe.leroy@csgroup.eu> # for
Link: https://lore.kernel.org/r/20240823062440.113628-1-kunwu.chan@linux.dev
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fsl/mc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index 083c860fd28e..c90ec889bfc2 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -436,7 +436,7 @@ void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev);
 struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev,
 					  u16 if_id);
 
-extern struct bus_type fsl_mc_bus_type;
+extern const struct bus_type fsl_mc_bus_type;
 
 extern struct device_type fsl_mc_bus_dprc_type;
 extern struct device_type fsl_mc_bus_dpni_type;
-- 
cgit v1.2.3


From 4c0a6a0ac902dbf184ae7be1f1fce225cc0b7380 Mon Sep 17 00:00:00 2001
From: Chanwoo Lee <cw9316.lee@samsung.com>
Date: Thu, 29 Aug 2024 11:47:09 +0900
Subject: mmc: core: Replace the argument of mmc_sd_switch() with defines

Replace with already defined values for readability. While at it, let's
also change the mode-parameter from an int to bool, as the only used values
are 0 or 1.

Signed-off-by: Chanwoo Lee <cw9316.lee@samsung.com>
Link: https://lore.kernel.org/r/20240829024709.402285-1-cw9316.lee@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/host.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index ac01cd1622ef..6a31ed02d3ff 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -648,7 +648,8 @@ static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host,
 	host->err_stats[stat] += 1;
 }
 
-int mmc_sd_switch(struct mmc_card *card, int mode, int group, u8 value, u8 *resp);
+int mmc_sd_switch(struct mmc_card *card, bool mode, int group,
+		u8 value, u8 *resp);
 int mmc_send_status(struct mmc_card *card, u32 *status);
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode);
-- 
cgit v1.2.3


From 7df7c204c678e24cd32d33360538670b7b90e330 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Thu, 22 Aug 2024 15:50:18 +0200
Subject: xfs: enable block size larger than page size support

Page cache now has the ability to have a minimum order when allocating
a folio which is a prerequisite to add support for block size > page
size.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20240827-xfs-fix-wformat-bs-gt-ps-v1-1-aec6717609e0@kernel.org # fix folded
Link: https://lore.kernel.org/r/20240822135018.1931258-11-kernel@pankajraghav.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4cc170949e9c..55b254d951da 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -374,6 +374,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define MAX_XAS_ORDER		(XA_CHUNK_SHIFT * 2 - 1)
 #define MAX_PAGECACHE_ORDER	min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
 
+/*
+ * mapping_max_folio_size_supported() - Check the max folio size supported
+ *
+ * The filesystem should call this function at mount time if there is a
+ * requirement on the folio mapping size in the page cache.
+ */
+static inline size_t mapping_max_folio_size_supported(void)
+{
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
+	return PAGE_SIZE;
+}
+
 /*
  * mapping_set_folio_order_range() - Set the orders supported by a file.
  * @mapping: The address space of the file.
-- 
cgit v1.2.3


From 31754ea6cbbc08d5bbe1fa290320c3048d8d98a3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 27 Aug 2024 06:51:36 -0400
Subject: iomap: add a private argument for iomap_file_buffered_write

In order to switch fuse over to using iomap for buffered writes we need
to be able to have the struct file for the original write, in case we
have to read in the page to make it uptodate.  Handle this by using the
existing private field in the iomap_iter, and add the argument to
iomap_file_buffered_write.  This will allow us to pass the file in
through the iomap buffered write path, and is flexible for any other
file systems needs.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/r/7f55c7c32275004ba00cddf862d970e6e633f750.1724755651.git.josef@toxicpanda.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iomap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6fc1c858013d..f792b37f7627 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -257,7 +257,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
 }
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
-		const struct iomap_ops *ops);
+		const struct iomap_ops *ops, void *private);
 int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 		struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
 		int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-- 
cgit v1.2.3


From 6f634eb080161baa4811a39f5ec07923ede092bc Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Tue, 27 Aug 2024 10:42:07 +0200
Subject: filemap: fix htmldoc warning for mapping_align_index()

Stephen reported that there is a kernel build warning due to a missing
description of a parameter in mapping_align_index().

Add the missing index parameter in the comment description.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Link: https://lore.kernel.org/r/20240827084206.106347-2-kernel@pankajraghav.com
Fixes: ab95d23bab22 ("filemap: allocate mapping_min_order folios in the page cache")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 55b254d951da..6c0dde447c98 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -470,6 +470,7 @@ mapping_min_folio_nrpages(struct address_space *mapping)
 /**
  * mapping_align_index() - Align index for this mapping.
  * @mapping: The address_space.
+ * @index: The page index.
  *
  * The index of a folio must be naturally aligned.  If you are adding a
  * new folio to the page cache and need to know what index to give it,
-- 
cgit v1.2.3


From 4686cc598f669dea1b50dde1568e6c65c355bc67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Aug 2024 00:25:51 +0200
Subject: sched: Clean up DL server vs core sched

Abide by the simple rule:

  pick_next_task() := pick_task() + set_next_task(.first = true)

This allows us to trivially get rid of server_pick_next() and things
collapse nicely.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240813224015.837303391@infradead.org
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3709dedbab59..57cf27a3045c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -692,7 +692,6 @@ struct sched_dl_entity {
 	 */
 	struct rq			*rq;
 	dl_server_has_tasks_f		server_has_tasks;
-	dl_server_pick_f		server_pick_next;
 	dl_server_pick_f		server_pick_task;
 
 #ifdef CONFIG_RT_MUTEXES
-- 
cgit v1.2.3


From 631598c419985327110135efb696d9f9fe67bffd Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 14 Aug 2024 11:26:29 +0200
Subject: iio: dac: ad5449: drop support for platform data

There are no longer any users of the platform data struct. Remove
support for it from the driver.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://patch.msgid.link/20240814092629.9862-1-brgl@bgdev.pl
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/platform_data/ad5449.h | 39 ------------------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 include/linux/platform_data/ad5449.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/ad5449.h b/include/linux/platform_data/ad5449.h
deleted file mode 100644
index d687ef5726c2..000000000000
--- a/include/linux/platform_data/ad5449.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * AD5415, AD5426, AD5429, AD5432, AD5439, AD5443, AD5449 Digital to Analog
- * Converter driver.
- *
- * Copyright 2012 Analog Devices Inc.
- *  Author: Lars-Peter Clausen <lars@metafoo.de>
- */
-
-#ifndef __LINUX_PLATFORM_DATA_AD5449_H__
-#define __LINUX_PLATFORM_DATA_AD5449_H__
-
-/**
- * enum ad5449_sdo_mode - AD5449 SDO pin configuration
- * @AD5449_SDO_DRIVE_FULL: Drive the SDO pin with full strength.
- * @AD5449_SDO_DRIVE_WEAK: Drive the SDO pin with not full strength.
- * @AD5449_SDO_OPEN_DRAIN: Operate the SDO pin in open-drain mode.
- * @AD5449_SDO_DISABLED: Disable the SDO pin, in this mode it is not possible to
- *			read back from the device.
- */
-enum ad5449_sdo_mode {
-	AD5449_SDO_DRIVE_FULL = 0x0,
-	AD5449_SDO_DRIVE_WEAK = 0x1,
-	AD5449_SDO_OPEN_DRAIN = 0x2,
-	AD5449_SDO_DISABLED = 0x3,
-};
-
-/**
- * struct ad5449_platform_data - Platform data for the ad5449 DAC driver
- * @sdo_mode: SDO pin mode
- * @hardware_clear_to_midscale: Whether asserting the hardware CLR pin sets the
- *			outputs to midscale (true) or to zero scale(false).
- */
-struct ad5449_platform_data {
-	enum ad5449_sdo_mode sdo_mode;
-	bool hardware_clear_to_midscale;
-};
-
-#endif
-- 
cgit v1.2.3


From 7cb9b5fa218caa899f4288865c4598357bcce4e9 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 3 Sep 2024 15:38:25 -0500
Subject: PCI: endpoint: Fix enum pci_epc_bar_type kerneldoc

e01c9797c0eb ("PCI: endpoint: Clean up hardware description for BARs")
added enum pci_epc_bar_type with incomplete kerneldoc.  Add the missing
piece.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci-epc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index 85bdf2adb760..697cc190747d 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -149,6 +149,7 @@ struct pci_epc {
 };
 
 /**
+ * enum pci_epc_bar_type - configurability of endpoint BAR
  * @BAR_PROGRAMMABLE: The BAR mask can be configured by the EPC.
  * @BAR_FIXED: The BAR mask is fixed by the hardware.
  * @BAR_RESERVED: The BAR should not be touched by an EPF driver.
-- 
cgit v1.2.3


From 364ea7ccaef917a3068236a19a4b31a0623b561a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 31 Aug 2024 16:20:39 +0200
Subject: power: supply: Change usb_types from an array into a bitmask

The bit_types array just hold a list of valid enum power_supply_usb_type
values which map to 0 - 9. This can easily be represented as a bitmap.

This reduces the size of struct power_supply_desc and further reduces
the data section size by drivers no longer needing to store the array.

This also unifies how usb_types are handled with charge_behaviours,
which allows power_supply_show_usb_type() to be removed.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240831142039.28830-7-hdegoede@redhat.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 72dc7e45c90c..910d407ebe63 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -243,8 +243,7 @@ struct power_supply_desc {
 	const char *name;
 	enum power_supply_type type;
 	u8 charge_behaviours;
-	const enum power_supply_usb_type *usb_types;
-	size_t num_usb_types;
+	u32 usb_types;
 	const enum power_supply_property *properties;
 	size_t num_properties;
 
-- 
cgit v1.2.3


From de6c85bf918ea52d5c680f0d130b37ee2ff152d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 Aug 2024 09:02:47 +0300
Subject: dma-mapping: clearly mark DMA ops as an architecture feature

DMA ops are a helper for architectures and not for drivers to override
the DMA implementation.

Unfortunately driver authors keep ignoring this.  Make the fact more
clear by renaming the symbol to ARCH_HAS_DMA_OPS and having the two drivers
overriding their dma_ops depend on that.  These drivers should probably be
marked broken, but we can give them a bit of a grace period for that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com> # for IPU6
Acked-by: Robin Murphy <robin.murphy@arm.com>
---
 include/linux/device.h      | 2 +-
 include/linux/dma-map-ops.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 1c5280d28bc3..b4bde8d22697 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -750,7 +750,7 @@ struct device {
 	struct dev_pin_info	*pins;
 #endif
 	struct dev_msi_info	msi;
-#ifdef CONFIG_DMA_OPS
+#ifdef CONFIG_ARCH_HAS_DMA_OPS
 	const struct dma_map_ops *dma_ops;
 #endif
 	u64		*dma_mask;	/* dma mask (if dma'able device) */
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 077b15c93bb8..9668ddf3696e 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -75,7 +75,7 @@ struct dma_map_ops {
 	unsigned long (*get_merge_boundary)(struct device *dev);
 };
 
-#ifdef CONFIG_DMA_OPS
+#ifdef CONFIG_ARCH_HAS_DMA_OPS
 #include <asm/dma-mapping.h>
 
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -90,7 +90,7 @@ static inline void set_dma_ops(struct device *dev,
 {
 	dev->dma_ops = dma_ops;
 }
-#else /* CONFIG_DMA_OPS */
+#else /* CONFIG_ARCH_HAS_DMA_OPS */
 static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 {
 	return NULL;
@@ -99,7 +99,7 @@ static inline void set_dma_ops(struct device *dev,
 			       const struct dma_map_ops *dma_ops)
 {
 }
-#endif /* CONFIG_DMA_OPS */
+#endif /* CONFIG_ARCH_HAS_DMA_OPS */
 
 #ifdef CONFIG_DMA_CMA
 extern struct cma *dma_contiguous_default_area;
-- 
cgit v1.2.3


From 73ed0baae66df50359c876f65f41179d6ebd2716 Mon Sep 17 00:00:00 2001
From: Chris Li <chrisl@kernel.org>
Date: Tue, 30 Jul 2024 23:49:13 -0700
Subject: mm: swap: swap cluster switch to double link list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm: swap: mTHP swap allocator base on swap cluster order",
v5.

This is the short term solutions "swap cluster order" listed in my "Swap
Abstraction" discussion slice 8 in the recent LSF/MM conference.

When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is
introduced, it only allocates the mTHP swap entries from the new empty
cluster list.   It has a fragmentation issue reported by Barry.

https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/

The reason is that all the empty clusters have been exhausted while there
are plenty of free swap entries in the cluster that are not 100% free.

Remember the swap allocation order in the cluster.  Keep track of the per
order non full cluster list for later allocation.

This series gives the swap SSD allocation a new separate code path from
the HDD allocation.  The new allocator use cluster list only and do not
global scan swap_map[] without lock any more.

This streamline the swap allocation for SSD.  The code matches the
execution flow much better.

User impact: For users that allocate and free mix order mTHP swapping, It
greatly improves the success rate of the mTHP swap allocation after the
initial phase.

It also performs faster when the swapfile is close to full, because the
allocator can get the non full cluster from a list rather than scanning a
lot of swap_map entries. 

With Barry's mthp test program V2:

Without:
$ ./thp_swap_allocator_test -a
Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71%
Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00%
Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00%
...
Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00%
Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00%
Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00%

$ ./thp_swap_allocator_test -a -s
Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00%
Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00%
Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00%
..
Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00%
Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00%
Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00%

$ ./thp_swap_allocator_test -s
Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00%
Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00%
Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00%
..
Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00%
Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00%
Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00%

$ ./thp_swap_allocator_test
Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00%
Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00%
Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00%
..
Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00%
Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00%
Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00%

With: # with all 0.00% filter out
$ ./thp_swap_allocator_test -a | grep -v "0.00%"
$ # all result are 0.00%

$ ./thp_swap_allocator_test -a -s | grep -v "0.00%"
./thp_swap_allocator_test -a -s | grep -v "0.00%"
Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33%
Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10%
Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51%
Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72%
Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81%
Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63%
Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90%
Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15%
Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88%
Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46%
Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49%
Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75%
Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28%
Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35%
Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 67: swpout inc: 221, swpout fallback inc: 1, Fallback percentage: 0.45%
Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93%
Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38%
Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88%
Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45%
Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64%
Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44%
Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30%

$ ./thp_swap_allocator_test
./thp_swap_allocator_test
Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00%
Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53%
Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58%
Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34%
Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51%
Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84%
Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91%
Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05%
Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25%
Iteration 10: swpout inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74%
Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01%
Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45%
Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98%
Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64%
Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36%
Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02%
Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07%

$ ./thp_swap_allocator_test -s
 ./thp_swap_allocator_test -s
Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00%
Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19%
Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05%
Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85%
Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67%
Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18%
Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96%
Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38%
Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11%
Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13%
Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17%
Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84%
Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60%
Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00%
Iteration 15: swpout inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67%
Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24%

=========
Kernel compile under tmpfs with cgroup memory.max = 470M.
12 core 24 hyperthreading, 32 jobs. 10 Run each group

SSD swap 10 runs average, 20G swap partition:
With:
user    2929.064
system  1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27
1504.47 1531.4 1532.92 1551.57
real    1441.324

Without:
user    2910.872
system  1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3
1470.19 1496.32 1544.1 1559.01
real    1580.822

Two zram swap: zram0 3.0G zram1 20G.

The idea is forcing the zram0 almost full then overflow to zram1:

With:
user    4320.301
system  4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06
4279.85 4285.98 4289.64 4293.13
real    431.759

Without
user    4301.393
system  4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05
4389.76 4397.16 4398.23 4403.9
real    433.979

------ more test result from Kaiui ----------

Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem:

System info: 32 Core AMD Zen2, 64G total memory.

Test 3 times using only 4K pages:
=================================

With:
-----
1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k
1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k
1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k

Summary (~4.6% improment of system time):
User: 1839.62
System: 2443.89: 2465.77 2454.68 2411.21
Real: 158.88

Without:
--------
1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k
1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k
1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k

Summary:
User: 1839.78
System: 2564.22: 2575.95 2555.15 2561.55
Real: 162.99

Test 5 times using enabled all mTHP pages:
==========================================

With:
-----
1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k
1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k
1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k
1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k
1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k

Summary (~8.4% improvement of system time):
User: 1803.25
System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08
Real: 175.90

mTHP swapout status:
/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721
/sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110
/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365
/sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269
/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24
/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341
/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145
/sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038
/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737
/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808
/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455
/sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010
/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973
/sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223
/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348
/sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541

Without:
--------
1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k
1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k
1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k
1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k
1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k

Summary:
User: 1811.61
System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40
Real: 185.356

mTHP swapout status:
hugepages-32kB/stats/swpout:35630
hugepages-32kB/stats/swpout_fallback:1809908
hugepages-512kB/stats/swpout:523
hugepages-512kB/stats/swpout_fallback:55235
hugepages-2048kB/stats/swpout:53
hugepages-2048kB/stats/swpout_fallback:17264
hugepages-1024kB/stats/swpout:85
hugepages-1024kB/stats/swpout_fallback:24979
hugepages-64kB/stats/swpout:30117
hugepages-64kB/stats/swpout_fallback:1825399
hugepages-16kB/stats/swpout:42775
hugepages-16kB/stats/swpout_fallback:1951123
hugepages-256kB/stats/swpout:2326
hugepages-256kB/stats/swpout_fallback:170165
hugepages-128kB/stats/swpout:17925
hugepages-128kB/stats/swpout_fallback:1309757


This patch (of 9):

Previously, the swap cluster used a cluster index as a pointer to
construct a custom single link list type "swap_cluster_list".  The next
cluster pointer is shared with the cluster->count.  It prevents puting the
non free cluster into a list.

Change the cluster to use the standard double link list instead.  This
allows tracing the nonfull cluster in the follow up patch.  That way, it
is faster to get to the nonfull cluster of that order.

Remove the cluster getter/setter for accessing the cluster struct member.

The list operation is protected by the swap_info_struct->lock.

Change cluster code to use "struct swap_cluster_info *" to reference the
cluster rather than by using index.  That is more consistent with the list
manipulation.  It avoids the repeat adding index to the cluser_info.  The
code is easier to understand.

Remove the cluster next pointer is NULL flag, the double link list can
handle the empty list pretty well.

The "swap_cluster_info" struct is two pointer bigger, because 512 swap
entries share one swap_cluster_info struct, it has very little impact on
the average memory usage per swap entry.  For 1TB swapfile, the swap
cluster data structure increases from 8MB to 24MB.

Other than the list conversion, there is no real function change in this
patch.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org
Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org
Signed-off-by: Chris Li <chrisl@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5b920fa2315b..ead568f04f81 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -243,22 +243,20 @@ enum {
  * free clusters are organized into a list. We fetch an entry from the list to
  * get a free cluster.
  *
- * The data field stores next cluster if the cluster is free or cluster usage
- * counter otherwise. The flags field determines if a cluster is free. This is
- * protected by swap_info_struct.lock.
+ * The flags field determines if a cluster is free. This is
+ * protected by cluster lock.
  */
 struct swap_cluster_info {
 	spinlock_t lock;	/*
 				 * Protect swap_cluster_info fields
-				 * and swap_info_struct->swap_map
-				 * elements correspond to the swap
-				 * cluster
+				 * other than list, and swap_info_struct->swap_map
+				 * elements corresponding to the swap cluster.
 				 */
-	unsigned int data:24;
-	unsigned int flags:8;
+	u16 count;
+	u8 flags;
+	struct list_head list;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
-#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -283,11 +281,6 @@ struct percpu_cluster {
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
-struct swap_cluster_list {
-	struct swap_cluster_info head;
-	struct swap_cluster_info tail;
-};
-
 /*
  * The in-memory structure used to track swap areas.
  */
@@ -300,7 +293,7 @@ struct swap_info_struct {
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
-	struct swap_cluster_list free_clusters; /* free clusters list */
+	struct list_head free_clusters; /* free clusters list */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
@@ -331,7 +324,7 @@ struct swap_info_struct {
 					 * list.
 					 */
 	struct work_struct discard_work; /* discard worker */
-	struct swap_cluster_list discard_clusters; /* discard clusters list */
+	struct list_head discard_clusters; /* discard clusters list */
 	struct plist_node avail_lists[]; /*
 					   * entries in swap_avail_heads, one
 					   * entry per node.
-- 
cgit v1.2.3


From d07a46a4ac18786e7f4c98fb08525ed80dd1f642 Mon Sep 17 00:00:00 2001
From: Chris Li <chrisl@kernel.org>
Date: Tue, 30 Jul 2024 23:49:14 -0700
Subject: mm: swap: mTHP allocate swap entries from nonfull list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Track the nonfull cluster as well as the empty cluster on lists.  Each
order has one nonfull cluster list.

The cluster will remember which order it was used during new cluster
allocation.

When the cluster has free entry, add to the nonfull[order] list.   When
the free cluster list is empty, also allocate from the nonempty list of
that order.

This improves the mTHP swap allocation success rate.

There are limitations if the distribution of numbers of different orders
of mTHP changes a lot.  e.g.  there are a lot of nonfull cluster assign to
order A while later time there are a lot of order B allocation while very
little allocation in order A.  Currently the cluster used by order A will
not reused by order B unless the cluster is 100% empty.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org
Signed-off-by: Chris Li <chrisl@kernel.org>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ead568f04f81..7007a3c2336c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,9 +254,11 @@ struct swap_cluster_info {
 				 */
 	u16 count;
 	u8 flags;
+	u8 order;
 	struct list_head list;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
+#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -294,6 +296,8 @@ struct swap_info_struct {
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
+	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
+					/* list of cluster that contains at least one free slot */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
-- 
cgit v1.2.3


From 477cb7ba28892eda112c79d8f75d10edabfc3050 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 30 Jul 2024 23:49:19 -0700
Subject: mm: swap: add a fragment cluster list

Now swap cluster allocator arranges the clusters in LRU style, so the
"cold" cluster stay at the head of nonfull lists are the ones that were
used for allocation long time ago and still partially occupied.  So if
allocator can't find enough contiguous slots to satisfy an high order
allocation, it's unlikely there will be slot being free on them to satisfy
the allocation, at least in a short period.

As a result, nonfull cluster scanning will waste time repeatly scanning
the unusable head of the list.

Also, multiple CPUs could content on the same head cluster of nonfull
list.  Unlike free clusters which are removed from the list when any CPU
starts using it, nonfull cluster stays on the head.

So introduce a new list frag list, all scanned nonfull clusters will be
moved to this list.  Both for avoiding repeated scanning and contention.

Frag list is still used as fallback for allocations, so if one CPU failed
to allocate one order of slots, it can still steal other CPU's clusters.
And order 0 will favor the fragmented clusters to better protect nonfull
clusters

If any slots on a fragment list are being freed, move the fragment list
back to nonfull list indicating it worth another scan on the cluster.
Compared to scan upon freeing a slot, this keep the scanning lazy and save
some CPU if there are still other clusters to use.

It may seems unneccessay to keep the fragmented cluster on list at all if
they can't be used for specific order allocation.  But this will start to
make sense once reclaim dring scanning is ready.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7007a3c2336c..c526cd3e7b31 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,6 +259,7 @@ struct swap_cluster_info {
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
+#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -298,6 +299,8 @@ struct swap_info_struct {
 	struct list_head free_clusters; /* free clusters list */
 	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that contains at least one free slot */
+	struct list_head frag_clusters[SWAP_NR_ORDERS];
+					/* list of cluster that are fragmented or contented */
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
-- 
cgit v1.2.3


From 661383c6111a38c88df61af6bfbcfacd2ff20a67 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 30 Jul 2024 23:49:20 -0700
Subject: mm: swap: relaim the cached parts that got scanned

This commit implements reclaim during scan for cluster allocator.

Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could
result in low allocation success rate or early OOM.

So to ensure maximum allocation success rate, integrate reclaiming with
scanning.  If found a range of suitable swap slots but fragmented due to
HAS_CACHE, just try to reclaim the slots.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c526cd3e7b31..e372122e4f45 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -301,6 +301,7 @@ struct swap_info_struct {
 					/* list of cluster that contains at least one free slot */
 	struct list_head frag_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that are fragmented or contented */
+	unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */
-- 
cgit v1.2.3


From 2cacbdfdee65b18f9952620e762eab043d71b564 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 30 Jul 2024 23:49:21 -0700
Subject: mm: swap: add a adaptive full cluster cache reclaim

Link all full cluster with one full list, and reclaim from it when the
allocation have ran out of all usable clusters.

There are many reason a folio can end up being in the swap cache while
having no swap count reference.  So the best way to search for such slots
is still by iterating the swap clusters.

With the list as an LRU, iterating from the oldest cluster and keep them
rotating is a very doable and clean way to free up potentially not inuse
clusters.

When any allocation failure, try reclaim and rotate only one cluster.
This is adaptive for high order allocations they can tolerate fallback.
So this avoids latency, and give the full cluster list an fair chance to
get reclaimed.  It release the usage stress for the fallback order 0
allocation or following up high order allocation.

If the swap device is getting very full, reclaim more aggresively to
ensure no OOM will happen.  This ensures order 0 heavy workload won't go
OOM as order 0 won't fail if any cluster still have any space.

[ryncsn@gmail.com: fix discard of full cluster]
  Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com
Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kairui Song <ryncsn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e372122e4f45..1c8f844a9f0f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -260,6 +260,7 @@ struct swap_cluster_info {
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */
 #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */
+#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */
 
 /*
  * The first page in the swap file is the swap header, which is always marked
@@ -297,6 +298,7 @@ struct swap_info_struct {
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
+	struct list_head full_clusters; /* full clusters list */
 	struct list_head nonfull_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that contains at least one free slot */
 	struct list_head frag_clusters[SWAP_NR_ORDERS];
-- 
cgit v1.2.3


From 46bcce503197d1019ee5c49ccde978e31298e35f Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:40:51 +0300
Subject: arch, mm: move definition of node_data to generic code

Every architecture that supports NUMA defines node_data in the same way:

	struct pglist_data *node_data[MAX_NUMNODES];

No reason to keep multiple copies of this definition and its forward
declarations, especially when such forward declaration is the only thing
in include/asm/mmzone.h for many architectures.

Add definition and declaration of node_data to generic code and drop
architecture-specific versions.

Link: https://lkml.kernel.org/r/20240807064110.1003856-8-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index eb19503604fe..e5841d4057ab 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -30,6 +30,9 @@ static inline bool numa_valid_node(int nid)
 #ifdef CONFIG_NUMA
 #include <asm/sparsemem.h>
 
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid)	(node_data[nid])
+
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
-- 
cgit v1.2.3


From ec164cf1dd3df4ae58931b9b491b06a90d5c22d3 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:40:52 +0300
Subject: mm: drop CONFIG_HAVE_ARCH_NODEDATA_EXTENSION

There are no users of HAVE_ARCH_NODEDATA_EXTENSION left, so
arch_alloc_nodedata() and arch_refresh_nodedata() are not needed anymore.

Replace the call to arch_alloc_nodedata() in free_area_init() with a new
helper alloc_offline_node_data(), remove arch_refresh_nodedata() and
cleanup include/linux/memory_hotplug.h from the associated ifdefery.

Link: https://lkml.kernel.org/r/20240807064110.1003856-9-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 48 ------------------------------------------
 include/linux/numa.h           |  4 ++++
 2 files changed, 4 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ebe876930e78..b27ddce5d324 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,54 +16,6 @@ struct resource;
 struct vmem_altmap;
 struct dev_pagemap;
 
-#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
-/*
- * For supporting node-hotadd, we have to allocate a new pgdat.
- *
- * If an arch has generic style NODE_DATA(),
- * node_data[nid] = kzalloc() works well. But it depends on the architecture.
- *
- * In general, generic_alloc_nodedata() is used.
- *
- */
-extern pg_data_t *arch_alloc_nodedata(int nid);
-extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
-
-#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
-#define arch_alloc_nodedata(nid)	generic_alloc_nodedata(nid)
-
-#ifdef CONFIG_NUMA
-/*
- * XXX: node aware allocation can't work well to get new node's memory at this time.
- *	Because, pgdat for the new node is not allocated/initialized yet itself.
- *	To use new node's memory, more consideration will be necessary.
- */
-#define generic_alloc_nodedata(nid)				\
-({								\
-	memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);	\
-})
-
-extern pg_data_t *node_data[];
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
-	node_data[nid] = pgdat;
-}
-
-#else /* !CONFIG_NUMA */
-
-/* never called */
-static inline pg_data_t *generic_alloc_nodedata(int nid)
-{
-	BUG();
-	return NULL;
-}
-static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
-{
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
-
 #ifdef CONFIG_MEMORY_HOTPLUG
 struct page *pfn_to_online_page(unsigned long pfn);
 
diff --git a/include/linux/numa.h b/include/linux/numa.h
index e5841d4057ab..b41b1569781b 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -33,6 +33,8 @@ static inline bool numa_valid_node(int nid)
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)	(node_data[nid])
 
+void __init alloc_offline_node_data(int nid);
+
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
@@ -60,6 +62,8 @@ static inline int phys_to_target_node(u64 start)
 {
 	return 0;
 }
+
+static inline void alloc_offline_node_data(int nid) {}
 #endif
 
 #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
-- 
cgit v1.2.3


From 3515863d9f29dd476cdad875fbd558fc85b4c511 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:40:53 +0300
Subject: arch, mm: pull out allocation of NODE_DATA to generic code

Architectures that support NUMA duplicate the code that allocates
NODE_DATA on the node-local memory with slight variations in reporting of
the addresses where the memory was allocated.

Use x86 version as the basis for the generic alloc_node_data() function
and call this function in architecture specific numa initialization.

Round up node data size to SMP_CACHE_BYTES rather than to PAGE_SIZE like
x86 used to do since the bootmem era when allocation granularity was
PAGE_SIZE anyway.

Link: https://lkml.kernel.org/r/20240807064110.1003856-10-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index b41b1569781b..3567e40329eb 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -33,6 +33,7 @@ static inline bool numa_valid_node(int nid)
 extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)	(node_data[nid])
 
+void __init alloc_node_data(int nid);
 void __init alloc_offline_node_data(int nid);
 
 /* Generic implementation available */
-- 
cgit v1.2.3


From 87482708210ff3333adab4dc5f1a491793159d43 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:01 +0300
Subject: mm: introduce numa_memblks

Move code dealing with numa_memblks from arch/x86 to mm/ and add Kconfig
options to let x86 select it in its Kconfig.

This code will be later reused by arch_numa.

No functional changes.

Link: https://lkml.kernel.org/r/20240807064110.1003856-18-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 include/linux/numa_memblks.h

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
new file mode 100644
index 000000000000..6981cf97d2c9
--- /dev/null
+++ b/include/linux/numa_memblks.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NUMA_MEMBLKS_H
+#define __NUMA_MEMBLKS_H
+
+#ifdef CONFIG_NUMA_MEMBLKS
+#include <linux/types.h>
+
+#define NR_NODE_MEMBLKS		(MAX_NUMNODES * 2)
+
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
+extern struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+extern struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
+
+int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+int __init numa_register_meminfo(struct numa_meminfo *mi);
+
+void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+				       const struct numa_meminfo *mi);
+
+#endif /* CONFIG_NUMA_MEMBLKS */
+
+#endif	/* __NUMA_MEMBLKS_H */
-- 
cgit v1.2.3


From 75f9d4cc4eb5e15dd5ce843e33de7f6346a0d4fa Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:02 +0300
Subject: mm: move numa_distance and related code from x86 to numa_memblks

Move code dealing with numa_distance array from arch/x86 to
mm/numa_memblks.c

This code will be later reused by arch_numa.

No functional changes.

Link: https://lkml.kernel.org/r/20240807064110.1003856-19-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index 6981cf97d2c9..968a590535ac 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -7,6 +7,10 @@
 
 #define NR_NODE_MEMBLKS		(MAX_NUMNODES * 2)
 
+extern int numa_distance_cnt;
+void __init numa_set_distance(int from, int to, int distance);
+void __init numa_reset_distance(void);
+
 struct numa_memblk {
 	u64			start;
 	u64			end;
-- 
cgit v1.2.3


From b0c4e27c687177aa36048110a12cba94bf291452 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:03 +0300
Subject: mm: introduce numa_emulation

Move numa_emulation code from arch/x86 to mm/numa_emulation.c

This code will be later reused by arch_numa.

No functional changes.

Link: https://lkml.kernel.org/r/20240807064110.1003856-20-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index 968a590535ac..f81f98678074 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -34,6 +34,23 @@ int __init numa_register_meminfo(struct numa_meminfo *mi);
 void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
 				       const struct numa_meminfo *mi);
 
+#ifdef CONFIG_NUMA_EMU
+int numa_emu_cmdline(char *str);
+void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
+					unsigned int nr_emu_nids);
+u64 __init numa_emu_dma_end(void);
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+			   int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
+{ }
+static inline int numa_emu_cmdline(char *str)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_NUMA_EMU */
+
 #endif /* CONFIG_NUMA_MEMBLKS */
 
 #endif	/* __NUMA_MEMBLKS_H */
-- 
cgit v1.2.3


From 692d73d2f0f75e58828ab05872b3789ae1f486ef Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:04 +0300
Subject: mm: numa_memblks: introduce numa_memblks_init

Move most of x86::numa_init() to numa_memblks so that the latter will be
more self-contained.

With this numa_memblk data structures should not be exposed to the
architecture specific code.

Link: https://lkml.kernel.org/r/20240807064110.1003856-21-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index f81f98678074..07381320848f 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -34,6 +34,9 @@ int __init numa_register_meminfo(struct numa_meminfo *mi);
 void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
 				       const struct numa_meminfo *mi);
 
+int __init numa_memblks_init(int (*init_func)(void),
+			     bool memblock_force_top_down);
+
 #ifdef CONFIG_NUMA_EMU
 int numa_emu_cmdline(char *str);
 void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
-- 
cgit v1.2.3


From 317ef4598bdc48c0196adc02ed4edaf82af279f2 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:05 +0300
Subject: mm: numa_memblks: make several functions and variables static

Make functions and variables that are exclusively used by numa_memblks
static.

Move numa_nodemask_from_meminfo() before its callers to avoid forward
declaration.

Link: https://lkml.kernel.org/r/20240807064110.1003856-22-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index 07381320848f..5c6e12ad0b7a 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -7,7 +7,6 @@
 
 #define NR_NODE_MEMBLKS		(MAX_NUMNODES * 2)
 
-extern int numa_distance_cnt;
 void __init numa_set_distance(int from, int to, int distance);
 void __init numa_reset_distance(void);
 
@@ -22,17 +21,10 @@ struct numa_meminfo {
 	struct numa_memblk	blk[NR_NODE_MEMBLKS];
 };
 
-extern struct numa_meminfo numa_meminfo __initdata_or_meminfo;
-extern struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
-
 int __init numa_add_memblk(int nodeid, u64 start, u64 end);
 void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
 
 int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
-int __init numa_register_meminfo(struct numa_meminfo *mi);
-
-void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
-				       const struct numa_meminfo *mi);
 
 int __init numa_memblks_init(int (*init_func)(void),
 			     bool memblock_force_top_down);
-- 
cgit v1.2.3


From 1b5695b02444660297cc54cab44f123ff28de2cc Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 7 Aug 2024 09:41:09 +0300
Subject: mm: make range-to-target_node lookup facility a part of numa_memblks

The x86 implementation of range-to-target_node lookup (i.e.
phys_to_target_node() and memory_add_physaddr_to_nid()) relies on
numa_memblks.

Since numa_memblks are now part of the generic code, move these functions
from x86 to mm/numa_memblks.c and select CONFIG_NUMA_KEEP_MEMINFO when
CONFIG_NUMA_MEMBLKS=y for dax and cxl.

[rppt@kernel.org: fix build]
  Link: https://lkml.kernel.org/r/ZtVfSt_zloPdDqVB@kernel.org
Link: https://lkml.kernel.org/r/20240807064110.1003856-26-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/numa_memblks.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index 5c6e12ad0b7a..cfad6ce7e1bd 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -46,6 +46,13 @@ static inline int numa_emu_cmdline(char *str)
 }
 #endif /* CONFIG_NUMA_EMU */
 
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+extern int phys_to_target_node(u64 start);
+#define phys_to_target_node phys_to_target_node
+extern int memory_add_physaddr_to_nid(u64 start);
+#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+#endif /* CONFIG_NUMA_KEEP_MEMINFO */
+
 #endif /* CONFIG_NUMA_MEMBLKS */
 
 #endif	/* __NUMA_MEMBLKS_H */
-- 
cgit v1.2.3


From 650180760be6bb448609d4d155eef3b728ace641 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 12 Aug 2024 15:42:02 +0800
Subject: mm: swap: extend swap_shmem_alloc() to support batch SWAP_MAP_SHMEM
 flag setting

Patch series "support large folio swap-out and swap-in for shmem", v5.

Shmem will support large folio allocation [1] [2] to get a better
performance, however, the memory reclaim still splits the precious large
folios when trying to swap-out shmem, which may lead to the memory
fragmentation issue and can not take advantage of the large folio for
shmeme.

Moreover, the swap code already supports for swapping out large folio
without split, and large folio swap-in[3] series is queued into
mm-unstable branch.  Hence this patch set also supports the large folio
swap-out and swap-in for shmem.


This patch (of 9):

To support shmem large folio swap operations, add a new parameter to
swap_shmem_alloc() that allows batch SWAP_MAP_SHMEM flag setting for shmem
swap entries.

While we are at it, using folio_nr_pages() to get the number of pages of
the folio as a preparation.

Link: https://lkml.kernel.org/r/cover.1723434324.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/99f64115d04b285e009580eb177352c57119ffd0.1723434324.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1c8f844a9f0f..248db1dd7812 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -481,7 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
-extern void swap_shmem_alloc(swp_entry_t);
+extern void swap_shmem_alloc(swp_entry_t, int);
 extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t entry, int nr);
 extern void swap_free_nr(swp_entry_t entry, int nr_pages);
@@ -548,7 +548,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 	return 0;
 }
 
-static inline void swap_shmem_alloc(swp_entry_t swp)
+static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
 {
 }
 
-- 
cgit v1.2.3


From 809bc86517cc408b5b8cb8e08e69096639432bc8 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 12 Aug 2024 15:42:10 +0800
Subject: mm: shmem: support large folio swap out

Shmem will support large folio allocation [1] [2] to get a better
performance, however, the memory reclaim still splits the precious large
folios when trying to swap out shmem, which may lead to the memory
fragmentation issue and can not take advantage of the large folio for
shmeme.

Moreover, the swap code already supports for swapping out large folio
without split, hence this patch set supports the large folio swap out for
shmem.

Note the i915_gem_shmem driver still need to be split when swapping, thus
add a new flag 'split_large_folio' for writeback_control to indicate
spliting the large folio.

[1] https://lore.kernel.org/all/cover.1717495894.git.baolin.wang@linux.alibaba.com/
[2] https://lore.kernel.org/all/20240515055719.32577-1-da.gomez@samsung.com/

[hughd@google.com: shmem_writepage() split folio at EOF before swapout]
  Link: https://lkml.kernel.org/r/aef55f8d-6040-692d-65e3-16150cce4440@google.com
[baolin.wang@linux.alibaba.com: remove the wbc->split_large_folio per Hugh]
  Link: https://lkml.kernel.org/r/1236a002daa301b3b9ba73d6c0fab348427cf295.1724833399.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/d80c21abd20e1b0f5ca66b330f074060fb2f082d.1723434324.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/writeback.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1a54676d843a..51278327b3c6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,6 +79,9 @@ struct writeback_control {
 	 */
 	struct swap_iocb **swap_plug;
 
+	/* Target list for splitting a large folio */
+	struct list_head *list;
+
 	/* internal fields used by the ->writepages implementation: */
 	struct folio_batch fbatch;
 	pgoff_t index;
-- 
cgit v1.2.3


From f77f0c7514789577125c1b2df145703161736359 Mon Sep 17 00:00:00 2001
From: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Date: Wed, 14 Aug 2024 17:42:27 +0000
Subject: mm,memcg: provide per-cgroup counters for NUMA balancing operations

The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.

Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.

For example, a container running a workload that has a much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.

For another example, some containers may exhibit longer periods between
data reuse, causing much more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.

In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.

This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success.  pgdemote_*
and pgpromote_success are also available in memory.numa_stat.

count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated.  The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.

[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
  Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 24 +++++++++++++++++++++---
 include/linux/vmstat.h     |  1 +
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ed170399179a..fe05fdb92779 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -784,6 +784,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 struct mem_cgroup *get_mem_cgroup_from_current(void);
 
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);
+
 struct lruvec *folio_lruvec_lock(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1028,8 +1030,8 @@ static inline void count_memcg_folio_events(struct folio *folio,
 		count_memcg_events(memcg, idx, nr);
 }
 
-static inline void count_memcg_event_mm(struct mm_struct *mm,
-					enum vm_event_item idx)
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+					enum vm_event_item idx, unsigned long count)
 {
 	struct mem_cgroup *memcg;
 
@@ -1039,10 +1041,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	if (likely(memcg))
-		count_memcg_events(memcg, idx, 1);
+		count_memcg_events(memcg, idx, count);
 	rcu_read_unlock();
 }
 
+static inline void count_memcg_event_mm(struct mm_struct *mm,
+					enum vm_event_item idx)
+{
+	count_memcg_events_mm(mm, idx, 1);
+}
+
 static inline void memcg_memory_event(struct mem_cgroup *memcg,
 				      enum memcg_memory_event event)
 {
@@ -1262,6 +1270,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+	return NULL;
+}
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 {
@@ -1484,6 +1497,11 @@ static inline void count_memcg_folio_events(struct folio *folio,
 {
 }
 
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+					enum vm_event_item idx, unsigned long count)
+{
+}
+
 static inline
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 9eb77c9007e6..d2761bf8ff32 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -32,6 +32,7 @@ struct reclaim_stat {
 	unsigned nr_ref_keep;
 	unsigned nr_unmap_fail;
 	unsigned nr_lazyfree_fail;
+	unsigned nr_demoted;
 };
 
 /* Stat data for system wide items */
-- 
cgit v1.2.3


From e98337d11bbdfa3e3f0fb99aa93e40f97549e0cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 13 Aug 2024 21:54:49 -0600
Subject: mm/contig_alloc: support __GFP_COMP

Patch series "mm/hugetlb: alloc/free gigantic folios", v2.

Use __GFP_COMP for gigantic folios can greatly reduce not only the amount
of code but also the allocation and free time.

Approximate LOC to mm/hugetlb.c: +60, -240

Allocate and free 500 1GB hugeTLB memory without HVO by:
  time echo 500 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
  time echo 0 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages

       Before  After
Alloc  ~13s    ~10s
Free   ~15s    <1s

The above magnitude generally holds for multiple x86 and arm64 CPU
models.

Perf profile before:
  Alloc
    - 99.99% alloc_pool_huge_folio
       - __alloc_fresh_hugetlb_folio
          - 83.23% alloc_contig_pages_noprof
             - 47.46% alloc_contig_range_noprof
                - 20.96% isolate_freepages_range
                     16.10% split_page
                - 14.10% start_isolate_page_range
                - 12.02% undo_isolate_page_range

  Free
    - update_and_free_pages_bulk
       - 87.71% free_contig_range
          - 76.02% free_unref_page
             - 41.30% free_unref_page_commit
                - 32.58% free_pcppages_bulk
                   - 24.75% __free_one_page
               13.96% _raw_spin_trylock
         12.27% __update_and_free_hugetlb_folio

Perf profile after:
  Alloc
    - 99.99% alloc_pool_huge_folio
         alloc_gigantic_folio
       - alloc_contig_pages_noprof
          - 59.15% alloc_contig_range_noprof
             - 20.72% start_isolate_page_range
               20.64% prep_new_page
             - 17.13% undo_isolate_page_range

  Free
    - update_and_free_pages_bulk
       - __folio_put
       - __free_pages_ok
            7.46% free_tail_page_prepare
          - 1.97% free_one_page
               1.86% __free_one_page

This patch (of 3):

Support __GFP_COMP in alloc_contig_range().  When the flag is set, upon
success the function returns a large folio prepared by prep_new_page(),
rather than a range of order-0 pages prepared by split_free_pages() (which
is renamed from split_map_pages()).

alloc_contig_range() can be used to allocate folios larger than
MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios.  So on the free path,
free_one_page() needs to handle that by split_large_buddy().

[akpm@linux-foundation.org: fix folio_alloc_gigantic_noprof() WARN expression, per Yu Liao]
Link: https://lkml.kernel.org/r/20240814035451.773331-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20240814035451.773331-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Frank van der Linden <fvdl@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53f76e0b17e..03ba9563c6db 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_
 #endif
 void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
+#ifdef CONFIG_CONTIG_ALLOC
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+							int nid, nodemask_t *node)
+{
+	struct page *page;
+
+	if (WARN_ON(!order || !(gfp & __GFP_COMP)))
+		return NULL;
+
+	page = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
+
+	return page ? page_folio(page) : NULL;
+}
+#else
+static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
+							int nid, nodemask_t *node)
+{
+	return NULL;
+}
+#endif
+/* This should be paired with folio_put() rather than free_contig_range(). */
+#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
+
 #endif /* __LINUX_GFP_H */
-- 
cgit v1.2.3


From 463586e9ff398f951a9a63d98a3b93f44434d20c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 13 Aug 2024 21:54:50 -0600
Subject: mm/cma: add cma_{alloc,free}_folio()

With alloc_contig_range() and free_contig_range() supporting large folios,
CMA can allocate and free large folios too, by cma_alloc_folio() and
cma_free_folio().

[yuzhao@google.com: fix WARN in cma_alloc_folio()]
  Link: https://lkml.kernel.org/r/Zsd0PgAQmbpR8jS6@google.com
Link: https://lkml.kernel.org/r/20240814035451.773331-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cma.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 9db877506ea8..d15b64f51336 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -52,4 +52,20 @@ extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
 
 extern void cma_reserve_pages_on_error(struct cma *cma);
+
+#ifdef CONFIG_CMA
+struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp);
+bool cma_free_folio(struct cma *cma, const struct folio *folio);
+#else
+static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
+{
+	return NULL;
+}
+
+static inline bool cma_free_folio(struct cma *cma, const struct folio *folio)
+{
+	return false;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From cf54f310d0d313bce6505d010a555948b0ae0e2a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 13 Aug 2024 21:54:51 -0600
Subject: mm/hugetlb: use __GFP_COMP for gigantic folios

Use __GFP_COMP for gigantic folios to greatly reduce not only the amount
of code but also the allocation and free time.

LOC (approximately): +60, -240

Allocate and free 500 1GB hugeTLB memory without HVO by:
  time echo 500 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
  time echo 0 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages

       Before  After
Alloc  ~13s    ~10s
Free   ~15s    <1s

The above magnitude generally holds for multiple x86 and arm64 CPU models.

Link: https://lkml.kernel.org/r/20240814035451.773331-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Frank van der Linden <fvdl@google.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3100a52ceb73..98c47c394b89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -896,10 +896,11 @@ static inline bool hugepage_movable_supported(struct hstate *h)
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-	if (hugepage_movable_supported(h))
-		return GFP_HIGHUSER_MOVABLE;
-	else
-		return GFP_HIGHUSER;
+	gfp_t gfp = __GFP_COMP | __GFP_NOWARN;
+
+	gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER;
+
+	return gfp;
 }
 
 static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
-- 
cgit v1.2.3


From d0b003ce97ad6518dea4b0ed70081df95de04179 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 16 Aug 2024 12:32:46 +0200
Subject: mm/rmap: use folio->_mapcount for small folios

We have some cases left whereby we operate on small folios and still refer
to page->_mapcount.  Let's just use folio->_mapcount instead, which
currently still overlays page->_mapcount, so no change.

This change will make it easier to later spot any remaining users of
page->_mapcount that target tail pages.

Link: https://lkml.kernel.org/r/20240816103246.719209-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0978c64f49d8..91b5935e8485 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -331,7 +331,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 	switch (level) {
 	case RMAP_LEVEL_PTE:
 		if (!folio_test_large(folio)) {
-			atomic_inc(&page->_mapcount);
+			atomic_inc(&folio->_mapcount);
 			break;
 		}
 
@@ -425,7 +425,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 		if (!folio_test_large(folio)) {
 			if (PageAnonExclusive(page))
 				ClearPageAnonExclusive(page);
-			atomic_inc(&page->_mapcount);
+			atomic_inc(&folio->_mapcount);
 			break;
 		}
 
-- 
cgit v1.2.3


From 489a744e5fb16503b495ad1fa3dd1abf25b224e7 Mon Sep 17 00:00:00 2001
From: Danilo Krummrich <dakr@kernel.org>
Date: Tue, 13 Aug 2024 00:34:35 +0200
Subject: mm: krealloc: clarify valid usage of __GFP_ZERO

Properly document that if __GFP_ZERO logic is requested, callers must
ensure that, starting with the initial memory allocation, every subsequent
call to this API for the same memory allocation is flagged with
__GFP_ZERO.  Otherwise, it is possible that __GFP_ZERO is not fully
honored by this API.

Link: https://lkml.kernel.org/r/20240812223707.32049-2-dakr@kernel.org
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index c9cb42203183..2282e67a01c7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -733,6 +733,16 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz
  * @new_n: new number of elements to alloc
  * @new_size: new size of a single member of the array
  * @flags: the type of memory to allocate (see kmalloc)
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * See krealloc_noprof() for further details.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
  */
 static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
 								       size_t new_n,
-- 
cgit v1.2.3


From a759e37fb46708029c9c3c56c3b62e6f24d85cf5 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Sun, 18 Aug 2024 23:01:51 +0200
Subject: err.h: add ERR_PTR_PCPU(), PTR_ERR_PCPU() and IS_ERR_PCPU() macros

Add ERR_PTR_PCPU(), PTR_ERR_PCPU() and IS_ERR_PCPU() macros that operate
on pointers in the percpu address space.

These macros remove the need for (__force void *) function argument casts
(to avoid sparse -Wcast-from-as warnings).

The patch will also avoid future build errors due to pointer address space
mismatch with enabled strict percpu address space checks.

Link: https://lkml.kernel.org/r/20240818210235.33481-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/err.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/err.h b/include/linux/err.h
index b5d9bb2a2349..a4dacd745fcf 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -41,6 +41,9 @@ static inline void * __must_check ERR_PTR(long error)
 	return (void *) error;
 }
 
+/* Return the pointer in the percpu address space. */
+#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))
+
 /**
  * PTR_ERR - Extract the error code from an error pointer.
  * @ptr: An error pointer.
@@ -51,6 +54,9 @@ static inline long __must_check PTR_ERR(__force const void *ptr)
 	return (long) ptr;
 }
 
+/* Read an error pointer from the percpu address space. */
+#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr)))
+
 /**
  * IS_ERR - Detect an error pointer.
  * @ptr: The pointer to check.
@@ -61,6 +67,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr)
 	return IS_ERR_VALUE((unsigned long)ptr);
 }
 
+/* Read an error pointer from the percpu address space. */
+#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr)))
+
 /**
  * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
  * @ptr: The pointer to check.
-- 
cgit v1.2.3


From ef5f379de302884b9b7ad9b62587a942a9f0bb55 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 20 Aug 2024 14:22:10 +0200
Subject: mm: always inline _compound_head() with
 CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y

We already force-inline page_fixed_fake_head(), page_is_fake_head() and
PageTail(), however the compiler might decide that _compound_head() is not
worthy to be inlined, because of page_fixed_fake_head().

The result is that, for example, PageAnonExclusive() now might involve a
function call when checking PageHuge(), which performs a
page_folio()->_compound_head() call.  This can lead to a slight regression
of the stress-ng.clone benchmark.

This is not super-urgent to fix, but always inlining _compound_head()
seems like the obvious thing to do for this primitive, similar to the
other ones.

This change restores the slight regression and a compilation with
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y shows no relevant bloat [2]:

	add/remove: 15/14 grow/shrink: 79/87 up/down: 12836/-13917 (-1081)
	...
	Total: Before=32786363, After=32785282, chg -0.00%

[1] https://lkml.kernel.org/r/817150f2-abf7-430f-9973-540bd6cdd26f@intel.com
[2] https://lore.kernel.org/all/116e117c-2821-401d-8e62-b85cdec37f4a@redhat.com/

Link: https://lkml.kernel.org/r/20240820122210.660140-1-david@redhat.com
Fixes: c0bff412e67b ("mm: allow anon exclusive check over hugetlb tail pages")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202407301049.5051dc19-oliver.sang@intel.com
Tested-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 17175a328e6b..2515ae85f191 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -232,7 +232,7 @@ static __always_inline int page_is_fake_head(const struct page *page)
 	return page_fixed_fake_head(page) != page;
 }
 
-static inline unsigned long _compound_head(const struct page *page)
+static __always_inline unsigned long _compound_head(const struct page *page)
 {
 	unsigned long head = READ_ONCE(page->compound_head);
 
-- 
cgit v1.2.3


From 0a2d82946be67e02fdf85a4010606bdc0546ba44 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Tue, 20 Aug 2024 10:26:39 +0800
Subject: mm: allow read-ahead with IOCB_NOWAIT set

Readahead support for IOCB_NOWAIT was introduced in commit 2e85abf053b9
("mm: allow read-ahead with IOCB_NOWAIT set").  However, this
implementation broke the semantics of IOCB_NOWAIT by potentially causing
it to wait on I/O during memory reclamation.  This behavior was later
modified in commit efa8480a8316 ("fs: RWF_NOWAIT should imply IOCB_NOIO").

To resolve the blocking issue during memory reclamation, we can use
memalloc_noio_{save,restore} to ensure non-blocking behavior.  This change
restores the original functionality, allowing preadv2(IOCB_NOWAIT) to
trigger readahead if the file content is not present in the page cache.

While this process may trigger direct memory reclamation, the
__GFP_NORETRY flag is set in the readahead GFP flags, ensuring it won't
block.

A use case for this change is when we want to trigger readahead in the
preadv2(2) syscall if the file cache is absent, but without waiting for
certain filesystem locks, like xfs_ilock.  A simple example is as follows:

retry:
    if (preadv2(fd, iovec, cnt, offset, RWF_NOWAIT) < 0) {
        do_other_work();
        goto retry;
    }

Link: https://lore.gnuweeb.org/io-uring/20200624164127.GP21350@casper.infradead.org/
Link: https://lkml.kernel.org/r/20240820022639.89562-1-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6ca11e241a24..8610242ec2ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3461,7 +3461,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
 	if (flags & RWF_NOWAIT) {
 		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
 			return -EOPNOTSUPP;
-		kiocb_flags |= IOCB_NOIO;
 	}
 	if (flags & RWF_ATOMIC) {
 		if (rw_type != WRITE)
-- 
cgit v1.2.3


From e880034cf7182aa35962bd30dfc9fa60476d56a4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 18:39:10 +0100
Subject: mm: introduce page_mapcount_is_type()

Resolve the awkward "and add one to this opaque constant" test into a
self-documenting inline function.

Link: https://lkml.kernel.org/r/20240821173914.2270383-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h         |  3 +--
 include/linux/page-flags.h | 12 +++++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0fde51c0532f..6f6053d7ea6f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1228,8 +1228,7 @@ static inline int folio_mapcount(const struct folio *folio)
 
 	if (likely(!folio_test_large(folio))) {
 		mapcount = atomic_read(&folio->_mapcount) + 1;
-		/* Handle page_has_type() pages */
-		if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+		if (page_mapcount_is_type(mapcount))
 			mapcount = 0;
 		return mapcount;
 	}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2515ae85f191..0b5484fdbca1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -956,12 +956,18 @@ enum pagetype {
 #define folio_test_type(folio, flag)					\
 	((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag))  == PAGE_TYPE_BASE)
 
-static inline int page_type_has_type(unsigned int page_type)
+static inline bool page_type_has_type(int page_type)
 {
-	return (int)page_type < PAGE_MAPCOUNT_RESERVE;
+	return page_type < PAGE_MAPCOUNT_RESERVE;
 }
 
-static inline int page_has_type(const struct page *page)
+/* This takes a mapcount which is one more than page->_mapcount */
+static inline bool page_mapcount_is_type(unsigned int mapcount)
+{
+	return page_type_has_type(mapcount - 1);
+}
+
+static inline bool page_has_type(const struct page *page)
 {
 	return page_type_has_type(READ_ONCE(page->page_type));
 }
-- 
cgit v1.2.3


From 4ffca5a96678c9fe1a39ec1e9c8fd326b57ea8a7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 18:39:11 +0100
Subject: mm: support only one page_type per page

By using a few values in the top byte, users of page_type can store up to
24 bits of additional data in page_type.  It also reduces the code size as
(with replacement of READ_ONCE() with data_race()), the kernel can check
just a single byte.  eg:

ffffffff811e3a79:       8b 47 30                mov    0x30(%rdi),%eax
ffffffff811e3a7c:       55                      push   %rbp
ffffffff811e3a7d:       48 89 e5                mov    %rsp,%rbp
ffffffff811e3a80:       25 00 00 00 82          and    $0x82000000,%eax
ffffffff811e3a85:       3d 00 00 00 80          cmp    $0x80000000,%eax
ffffffff811e3a8a:       74 4d                   je     ffffffff811e3ad9 <folio_mapping+0x69>

becomes:

ffffffff811e3a69:       80 7f 33 f5             cmpb   $0xf5,0x33(%rdi)
ffffffff811e3a6d:       55                      push   %rbp
ffffffff811e3a6e:       48 89 e5                mov    %rsp,%rbp
ffffffff811e3a71:       74 4d                   je     ffffffff811e3ac0 <folio_mapping+0x60>

replacing three instructions with one.

[wangkefeng.wang@huawei.com: fix ubsan warnings]
  Link: https://lkml.kernel.org/r/2d19c48a-c550-4345-bf36-d05cd303c5de@huawei.com
Link: https://lkml.kernel.org/r/20240821173914.2270383-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 68 +++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0b5484fdbca1..7a1b2948b075 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -923,42 +923,29 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #endif
 
 /*
- * For pages that are never mapped to userspace,
- * page_type may be used.  Because it is initialised to -1, we invert the
- * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
- * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
- * low bits so that an underflow or overflow of _mapcount won't be
- * mistaken for a page type value.
+ * For pages that do not use mapcount, page_type may be used.
+ * The low 24 bits of pagetype may be used for your own purposes, as long
+ * as you are careful to not affect the top 8 bits.  The low bits of
+ * pagetype will be overwritten when you clear the page_type from the page.
  */
-
 enum pagetype {
-	PG_buddy	= 0x40000000,
-	PG_offline	= 0x20000000,
-	PG_table	= 0x10000000,
-	PG_guard	= 0x08000000,
-	PG_hugetlb	= 0x04000000,
-	PG_slab		= 0x02000000,
-	PG_zsmalloc	= 0x01000000,
-	PG_unaccepted	= 0x00800000,
-
-	PAGE_TYPE_BASE	= 0x80000000,
-
-	/*
-	 * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and
-	 * allow owners that set a type to reuse the lower 16 bit for their own
-	 * purposes.
-	 */
-	PAGE_MAPCOUNT_RESERVE	= ~0x0000ffff,
+	/* 0x00-0x7f are positive numbers, ie mapcount */
+	/* Reserve 0x80-0xef for mapcount overflow. */
+	PGTY_buddy	= 0xf0,
+	PGTY_offline	= 0xf1,
+	PGTY_table	= 0xf2,
+	PGTY_guard	= 0xf3,
+	PGTY_hugetlb	= 0xf4,
+	PGTY_slab	= 0xf5,
+	PGTY_zsmalloc	= 0xf6,
+	PGTY_unaccepted	= 0xf7,
+
+	PGTY_mapcount_underflow = 0xff
 };
 
-#define PageType(page, flag)						\
-	((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
-#define folio_test_type(folio, flag)					\
-	((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag))  == PAGE_TYPE_BASE)
-
 static inline bool page_type_has_type(int page_type)
 {
-	return page_type < PAGE_MAPCOUNT_RESERVE;
+	return page_type < (PGTY_mapcount_underflow << 24);
 }
 
 /* This takes a mapcount which is one more than page->_mapcount */
@@ -969,40 +956,41 @@ static inline bool page_mapcount_is_type(unsigned int mapcount)
 
 static inline bool page_has_type(const struct page *page)
 {
-	return page_type_has_type(READ_ONCE(page->page_type));
+	return page_mapcount_is_type(data_race(page->page_type));
 }
 
 #define FOLIO_TYPE_OPS(lname, fname)					\
-static __always_inline bool folio_test_##fname(const struct folio *folio)\
+static __always_inline bool folio_test_##fname(const struct folio *folio) \
 {									\
-	return folio_test_type(folio, PG_##lname);			\
+	return data_race(folio->page.page_type >> 24) == PGTY_##lname;	\
 }									\
 static __always_inline void __folio_set_##fname(struct folio *folio)	\
 {									\
-	VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);		\
-	folio->page.page_type &= ~PG_##lname;				\
+	VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX,	\
+			folio);						\
+	folio->page.page_type = (unsigned int)PGTY_##lname << 24;	\
 }									\
 static __always_inline void __folio_clear_##fname(struct folio *folio)	\
 {									\
 	VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);		\
-	folio->page.page_type |= PG_##lname;				\
+	folio->page.page_type = UINT_MAX;				\
 }
 
 #define PAGE_TYPE_OPS(uname, lname, fname)				\
 FOLIO_TYPE_OPS(lname, fname)						\
 static __always_inline int Page##uname(const struct page *page)		\
 {									\
-	return PageType(page, PG_##lname);				\
+	return data_race(page->page_type >> 24) == PGTY_##lname;	\
 }									\
 static __always_inline void __SetPage##uname(struct page *page)		\
 {									\
-	VM_BUG_ON_PAGE(!PageType(page, 0), page);			\
-	page->page_type &= ~PG_##lname;					\
+	VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page);	\
+	page->page_type = (unsigned int)PGTY_##lname << 24;		\
 }									\
 static __always_inline void __ClearPage##uname(struct page *page)	\
 {									\
 	VM_BUG_ON_PAGE(!Page##uname(page), page);			\
-	page->page_type |= PG_##lname;					\
+	page->page_type = UINT_MAX;					\
 }
 
 /*
-- 
cgit v1.2.3


From bf03c806993091d8fb2fc0e98f07a8ef251c78f3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:34 +0100
Subject: mm: remove PageActive

Patch series "Simplify the page flags a little".

In the course of our folio conversions, we have made many page flags only
used on folios, so we can now remove the page-based accessors.  This
should cut down compile time a little, and prevent new users from cropping
up.

There is more that could be done in this area, but it would produce merge
conflicts, so I'll sit on those patches until next merge window.  We now
have line of sight to removing PG_private_2 and PG_private.


This patch (of 10):

This flag is now only used on folios, so we can remove all the page
accessors.

[akpm@linux-foundation.org: fix arch/powerpc/mm/pgtable-frag.c]
Link: https://lkml.kernel.org/r/20240821193445.2294269-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20240821193445.2294269-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7a1b2948b075..34347bee1217 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -510,8 +510,9 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
 	__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
 PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
 	TESTCLEARFLAG(LRU, lru, PF_HEAD)
-PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
-	TESTCLEARFLAG(Active, active, PF_HEAD)
+FOLIO_FLAG(active, FOLIO_HEAD_PAGE)
+	__FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
 PAGEFLAG(Workingset, workingset, PF_HEAD)
 	TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
 PAGEFLAG(Checked, checked, PF_NO_COMPOUND)	   /* Used by some filesystems */
-- 
cgit v1.2.3


From 0b7582803649e0e58e5f8c9a4ede362777bc7eec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:35 +0100
Subject: mm: remove PageSwapBacked

This flag is now only used on folios, so we can remove all the page
accessors.

Link: https://lkml.kernel.org/r/20240821193445.2294269-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 34347bee1217..cebb3dc0fd0c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -528,9 +528,9 @@ PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
 PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
 	__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
 	__SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
-PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
-	__CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
-	__SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
+	__FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE)
+	__FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE)
 
 /*
  * Private page markings that may be used by the filesystem that owns the page
-- 
cgit v1.2.3


From 6f394ee9ddb4bd1879e42c9c8648a37f650bea99 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:36 +0100
Subject: mm: remove PageReadahead

This flag is now only used on folios, so we can remove all the page
accessors.

Link: https://lkml.kernel.org/r/20240821193445.2294269-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cebb3dc0fd0c..71444223d630 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -553,8 +553,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
 	TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
-PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
-	TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)
+FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE)
 
 #ifdef CONFIG_HIGHMEM
 /*
-- 
cgit v1.2.3


From 32f51ead3d7771cdec29f75e08d50a76d2c6253d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:37 +0100
Subject: mm: remove PageSwapCache

This flag is now only used on folios, so we can remove all the page
accessors and reword the comments that refer to them.

Link: https://lkml.kernel.org/r/20240821193445.2294269-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h   |  2 +-
 include/linux/page-flags.h | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2419e60c9a7f..6e3bdf8e38bc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -109,7 +109,7 @@ struct page {
 			/**
 			 * @private: Mapping-private opaque data.
 			 * Usually used for buffer_heads if PagePrivate.
-			 * Used for swp_entry_t if PageSwapCache.
+			 * Used for swp_entry_t if swapcache flag set.
 			 * Indicates order in the buddy system if PageBuddy.
 			 */
 			unsigned long private;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 71444223d630..b31d2c50b6f0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -574,15 +574,10 @@ static __always_inline bool folio_test_swapcache(const struct folio *folio)
 			test_bit(PG_swapcache, const_folio_flags(folio, 0));
 }
 
-static __always_inline bool PageSwapCache(const struct page *page)
-{
-	return folio_test_swapcache(page_folio(page));
-}
-
-SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
-CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
+FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE)
+FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE)
 #else
-PAGEFLAG_FALSE(SwapCache, swapcache)
+FOLIO_FLAG_FALSE(swapcache)
 #endif
 
 PAGEFLAG(Unevictable, unevictable, PF_HEAD)
-- 
cgit v1.2.3


From cb29e7941d5d57576392a52c08077ef327ec8e17 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:38 +0100
Subject: mm: remove PageUnevictable

There is only one caller of PageUnevictable() left; convert it to call
folio_test_unevictable() and remove all the page accessors.

Link: https://lkml.kernel.org/r/20240821193445.2294269-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b31d2c50b6f0..2150502195d5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -580,9 +580,9 @@ FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE)
 FOLIO_FLAG_FALSE(swapcache)
 #endif
 
-PAGEFLAG(Unevictable, unevictable, PF_HEAD)
-	__CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
-	TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
+FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE)
+	__FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
 
 #ifdef CONFIG_MMU
 PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
-- 
cgit v1.2.3


From 99f86bbda31790f2ca0e365f02650585536929ab Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:39 +0100
Subject: mm: remove PageMlocked

This flag is now only used on folios, so we can remove all the page
accessors.

Link: https://lkml.kernel.org/r/20240821193445.2294269-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2150502195d5..08cce16c82aa 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -585,12 +585,15 @@ FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE)
 	FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
 
 #ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
-	__CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
-	TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE)
+	__FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE)
 #else
-PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
-	TESTSCFLAG_FALSE(Mlocked, mlocked)
+FOLIO_FLAG_FALSE(mlocked)
+	__FOLIO_CLEAR_FLAG_NOOP(mlocked)
+	FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked)
+	FOLIO_TEST_SET_FLAG_FALSE(mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
-- 
cgit v1.2.3


From 3026bc1e82b695d69cde21712512922abe74aab9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:40 +0100
Subject: mm: remove PageOwnerPriv1

While there are many aliases for this flag, nobody actually uses the
*PageOwnerPriv1() nor folio_*_owner_priv_1() accessors.  Remove them.

Link: https://lkml.kernel.org/r/20240821193445.2294269-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 08cce16c82aa..458d2dc5124d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -539,8 +539,6 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
  */
 PAGEFLAG(Private, private, PF_ANY)
 PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
-PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
-	TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
-- 
cgit v1.2.3


From 6dc151388e44957d471633ce70d72e3d27ae836d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:41 +0100
Subject: mm: remove page_has_private()

This function has no more callers, except folio_has_private().  Combine
the two functions.

Link: https://lkml.kernel.org/r/20240821193445.2294269-9-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 458d2dc5124d..017205048cab 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1175,20 +1175,15 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
 /**
- * page_has_private - Determine if page has private stuff
- * @page: The page to be checked
+ * folio_has_private - Determine if folio has private stuff
+ * @folio: The folio to be checked
  *
- * Determine if a page has private stuff, indicating that release routines
+ * Determine if a folio has private stuff, indicating that release routines
  * should be invoked upon it.
  */
-static inline int page_has_private(const struct page *page)
+static inline int folio_has_private(const struct folio *folio)
 {
-	return !!(page->flags & PAGE_FLAGS_PRIVATE);
-}
-
-static inline bool folio_has_private(const struct folio *folio)
-{
-	return page_has_private(&folio->page);
+	return !!(folio->flags & PAGE_FLAGS_PRIVATE);
 }
 
 #undef PF_ANY
-- 
cgit v1.2.3


From 02e1960aafac33721401dcd92e915325fdb524b2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:42 +0100
Subject: mm: rename PG_mappedtodisk to PG_owner_2

This flag has similar constraints to PG_owner_priv_1 -- it is ignored by
core code, and is entirely for the use of the code which allocated the
folio.  Since the pagecache does not use it, individual filesystems can
use it.  The bufferhead code does use it, so filesystems which use the
buffer cache must not use it for another purpose.

Link: https://lkml.kernel.org/r/20240821193445.2294269-10-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kernel-page-flags.h |  2 +-
 include/linux/page-flags.h        | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index 859f4b0c1b2b..7c587a711be1 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -10,7 +10,7 @@
  */
 #define KPF_RESERVED		32
 #define KPF_MLOCKED		33
-#define KPF_MAPPEDTODISK	34
+#define KPF_OWNER_2		34
 #define KPF_PRIVATE		35
 #define KPF_PRIVATE_2		36
 #define KPF_OWNER_PRIVATE	37
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 017205048cab..1ff3d172c22c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -101,12 +101,12 @@ enum pageflags {
 	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
 	PG_active,
 	PG_workingset,
-	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
+	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use */
+	PG_owner_2,		/* Owner use. If pagecache, fs may use */
 	PG_arch_1,
 	PG_reserved,
 	PG_private,		/* If pagecache, has fs-private data */
 	PG_private_2,		/* If pagecache, has fs aux data */
-	PG_mappedtodisk,	/* Has blocks allocated on-disk */
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_swapbacked,		/* Page is backed by RAM/swap */
 	PG_unevictable,		/* Page is "unevictable"  */
@@ -131,6 +131,11 @@ enum pageflags {
 
 	PG_readahead = PG_reclaim,
 
+	/* Anonymous memory (and shmem) */
+	PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
+	/* Some filesystems */
+	PG_checked = PG_owner_priv_1,
+
 	/*
 	 * Depending on the way an anonymous folio can be mapped into a page
 	 * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
@@ -138,13 +143,13 @@ enum pageflags {
 	 * tail pages of an anonymous folio. For now, we only expect it to be
 	 * set on tail pages for PTE-mapped THP.
 	 */
-	PG_anon_exclusive = PG_mappedtodisk,
-
-	/* Filesystems */
-	PG_checked = PG_owner_priv_1,
+	PG_anon_exclusive = PG_owner_2,
 
-	/* SwapBacked */
-	PG_swapcache = PG_owner_priv_1,	/* Swap page: swp_entry_t in private */
+	/*
+	 * Set if all buffer heads in the folio are mapped.
+	 * Filesystems which do not use BHs can use it for their own purpose.
+	 */
+	PG_mappedtodisk = PG_owner_2,
 
 	/* Two page bits are conscripted by FS-Cache to maintain local caching
 	 * state.  These bits are set on pages belonging to the netfs's inodes
@@ -540,6 +545,9 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
 PAGEFLAG(Private, private, PF_ANY)
 PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
 
+/* owner_2 can be set on tail pages for anon memory */
+FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
+
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
-- 
cgit v1.2.3


From 7a87225ae2c6c317c7b80cf599e5cf0eee699196 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 21 Aug 2024 20:34:43 +0100
Subject: x86: remove PG_uncached

Convert x86 to use PG_arch_2 instead of PG_uncached and remove
PG_uncached.

Link: https://lkml.kernel.org/r/20240821193445.2294269-11-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kernel-page-flags.h |  1 -
 include/linux/page-flags.h        | 13 +++----------
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h
index 7c587a711be1..196778a087c4 100644
--- a/include/linux/kernel-page-flags.h
+++ b/include/linux/kernel-page-flags.h
@@ -15,7 +15,6 @@
 #define KPF_PRIVATE_2		36
 #define KPF_OWNER_PRIVATE	37
 #define KPF_ARCH		38
-#define KPF_UNCACHED		39
 #define KPF_SOFTDIRTY		40
 #define KPF_ARCH_2		41
 #define KPF_ARCH_3		42
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1ff3d172c22c..2175ebceb41c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -113,9 +113,6 @@ enum pageflags {
 #ifdef CONFIG_MMU
 	PG_mlocked,		/* Page is vma mlocked */
 #endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-	PG_uncached,		/* Page has been mapped as uncached */
-#endif
 #ifdef CONFIG_MEMORY_FAILURE
 	PG_hwpoison,		/* hardware poisoned page. Don't touch */
 #endif
@@ -123,8 +120,10 @@ enum pageflags {
 	PG_young,
 	PG_idle,
 #endif
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
 	PG_arch_2,
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
 	PG_arch_3,
 #endif
 	__NR_PAGEFLAGS,
@@ -602,12 +601,6 @@ FOLIO_FLAG_FALSE(mlocked)
 	FOLIO_TEST_SET_FLAG_FALSE(mlocked)
 #endif
 
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
-#else
-PAGEFLAG_FALSE(Uncached, uncached)
-#endif
-
 #ifdef CONFIG_MEMORY_FAILURE
 PAGEFLAG(HWPoison, hwpoison, PF_ANY)
 TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
-- 
cgit v1.2.3


From 0ca0c24e3211586d44a31b3c4767fe3f43a008a7 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Fri, 23 Aug 2024 20:04:39 +0100
Subject: mm: store zero pages to be swapped out in a bitmap

Patch series "mm: store zero pages to be swapped out in a bitmap", v8.

As shown in the patch series that introduced the zswap same-filled
optimization [1], 10-20% of the pages stored in zswap are same-filled.
This is also observed across Meta's server fleet.  By using VM counters in
swap_writepage (not included in this patchseries) it was found that less
than 1% of the same-filled pages to be swapped out are non-zero pages.

For conventional swap setup (without zswap), rather than reading/writing
these pages to flash resulting in increased I/O and flash wear, a bitmap
can be used to mark these pages as zero at write time, and the pages can
be filled at read time if the bit corresponding to the page is set.

When using zswap with swap, this also means that a zswap_entry does not
need to be allocated for zero filled pages resulting in memory savings
which would offset the memory used for the bitmap.

A similar attempt was made earlier in [2] where zswap would only track
zero-filled pages instead of same-filled.  This patchseries adds
zero-filled pages optimization to swap (hence it can be used even if zswap
is disabled) and removes the same-filled code from zswap (as only 1% of
the same-filled pages are non-zero), simplifying code.

[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/
[2] https://lore.kernel.org/lkml/20240325235018.2028408-1-yosryahmed@google.com/


This patch (of 2):

Approximately 10-20% of pages to be swapped out are zero pages [1].
Rather than reading/writing these pages to flash resulting
in increased I/O and flash wear, a bitmap can be used to mark these
pages as zero at write time, and the pages can be filled at
read time if the bit corresponding to the page is set.
With this patch, NVMe writes in Meta server fleet decreased
by almost 10% with conventional swap setup (zswap disabled).

[1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/

Link: https://lkml.kernel.org/r/20240823190545.979059-1-usamaarif642@gmail.com
Link: https://lkml.kernel.org/r/20240823190545.979059-2-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 248db1dd7812..ca533b478c21 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -296,6 +296,7 @@ struct swap_info_struct {
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
 	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+	unsigned long *zeromap;		/* kvmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
 	struct list_head full_clusters; /* full clusters list */
-- 
cgit v1.2.3


From 63fc66f5b6b18f39269a66cf34d8cb7a24fbfe88 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
Date: Fri, 30 Aug 2024 00:00:58 -0400
Subject: ipc/shm, mm: drop do_vma_munmap()

The do_vma_munmap() wrapper existed for callers that didn't have a vma
iterator and needed to check the vma mseal status prior to calling the
underlying munmap().  All callers now use a vma iterator and since the
mseal check has been moved to do_vmi_align_munmap() and the vmas are
aligned, this function can just be called instead.

do_vmi_align_munmap() can no longer be static as ipc/shm is using it and
it is exported via the mm.h header.

Link: https://lkml.kernel.org/r/20240830040101.822209-19-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6f6053d7ea6f..b0ff06d18c71 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3287,14 +3287,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr,
 extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 			 unsigned long start, size_t len, struct list_head *uf,
 			 bool unlock);
+int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		    struct mm_struct *mm, unsigned long start,
+		    unsigned long end, struct list_head *uf, bool unlock);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
 extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
 
 #ifdef CONFIG_MMU
-extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
-			 unsigned long start, unsigned long end,
-			 struct list_head *uf, bool unlock);
 extern int __mm_populate(unsigned long addr, unsigned long len,
 			 int ignore_errors);
 static inline void mm_populate(unsigned long addr, unsigned long len)
-- 
cgit v1.2.3


From f1264e9531b0fbadf88e0b9c8143c546343b481c Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 27 Aug 2024 19:47:27 +0800
Subject: mm: migrate: add isolate_folio_to_list()

Add isolate_folio_to_list() helper to try to isolate HugeTLB, no-LRU
movable and LRU folios to a list, which will be reused by
do_migrate_range() from memory hotplug soon, also drop the
mf_isolate_folio() since we could directly use new helper in the
soft_offline_in_use_page().

Link: https://lkml.kernel.org/r/20240827114728.3212578-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Tested-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 644be30b69c8..002e49b2ebd9 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -70,6 +70,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free,
 		  unsigned int *ret_succeeded);
 struct folio *alloc_migration_target(struct folio *src, unsigned long private);
 bool isolate_movable_page(struct page *page, isolate_mode_t mode);
+bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
 
 int migrate_huge_page_move_mapping(struct address_space *mapping,
 		struct folio *dst, struct folio *src);
@@ -91,6 +92,8 @@ static inline struct folio *alloc_migration_target(struct folio *src,
 	{ return NULL; }
 static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode)
 	{ return false; }
+static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
+	{ return false; }
 
 static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct folio *dst, struct folio *src)
-- 
cgit v1.2.3


From d29e741cad3f8f41df1834bf74df79380c1c6c6d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 19 Aug 2024 17:17:04 +0200
Subject: gpio: davinci: drop platform data support

There are no more any board files that use the platform data for
gpio-davinci. We can remove the header defining it and port the code to
no longer store any context in pdata.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20240819151705.37258-1-brgl@bgdev.pl
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/platform_data/gpio-davinci.h | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 include/linux/platform_data/gpio-davinci.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h
deleted file mode 100644
index b82e44662efe..000000000000
--- a/include/linux/platform_data/gpio-davinci.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * DaVinci GPIO Platform Related Defines
- *
- * Copyright (C) 2013 Texas Instruments Incorporated - https://www.ti.com/
- */
-
-#ifndef __DAVINCI_GPIO_PLATFORM_H
-#define __DAVINCI_GPIO_PLATFORM_H
-
-struct davinci_gpio_platform_data {
-	bool	no_auto_base;
-	u32	base;
-	u32	ngpio;
-	u32	gpio_unbanked;
-};
-
-/* Convert GPIO signal to GPIO pin number */
-#define GPIO_TO_PIN(bank, gpio)	(16 * (bank) + (gpio))
-
-#endif
-- 
cgit v1.2.3


From 8b1451f2f723f845c05b8bad3d4c45de284338b5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 3 Sep 2024 21:54:28 -1000
Subject: sched_ext: Replace SCX_TASK_BAL_KEEP with SCX_RQ_BAL_KEEP

SCX_TASK_BAL_KEEP is used by balance_one() to tell pick_next_task_scx() to
keep running the current task. It's not really a task property. Replace it
with SCX_RQ_BAL_KEEP which resides in rq->scx.flags and is a better fit for
the usage. Also, the existing clearing rule is unnecessarily strict and
makes it difficult to use with core-sched. Just clear it on entry to
balance_one().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 69f68e2121a8..db2a266113ac 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -71,7 +71,6 @@ struct scx_dispatch_q {
 /* scx_entity.flags */
 enum scx_ent_flags {
 	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
-	SCX_TASK_BAL_KEEP	= 1 << 1, /* balance decided to keep current */
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 
-- 
cgit v1.2.3


From 47f218d108950984b24af81f66356ceda380eb74 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 29 Aug 2024 21:06:17 -0300
Subject: iommu/amd: Store the nid in io_pgtable_cfg instead of the domain

We already have memory in the union here that is being wasted in AMD's
case, use it to store the nid.

Putting the nid here further isolates the io_pgtable code from the struct
protection_domain.

Fixup protection_domain_alloc so that the NID from the device is provided,
at this point dev is never NULL for AMD so this will now allocate the
first table pointer on the correct NUMA node.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Link: https://lore.kernel.org/r/8-v2-831cdc4d00f3+1a315-amd_iopgtbl_jgg@nvidia.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/io-pgtable.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index f9a81761bfce..b1ecfc3cd5bc 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -171,6 +171,10 @@ struct io_pgtable_cfg {
 			u64 ttbr[4];
 			u32 n_ttbrs;
 		} apple_dart_cfg;
+
+		struct {
+			int nid;
+		} amd;
 	};
 };
 
-- 
cgit v1.2.3


From 9f82f15ddfdd60bb9820f09737333b2345e22ab3 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Thu, 22 Aug 2024 16:10:46 +0100
Subject: mm: use ARCH_PKEY_BITS to define VM_PKEY_BITN

Use the new CONFIG_ARCH_PKEY_BITS to simplify setting these bits
for different architectures.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20240822151113.1479789-4-joey.gouly@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..fb6ccd93f589 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -330,12 +330,16 @@ extern unsigned int kobjsize(const void *objp);
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0	VM_HIGH_ARCH_0	/* A protection key is a 4-bit value */
-# define VM_PKEY_BIT1	VM_HIGH_ARCH_1	/* on x86 and 5-bit value on ppc64   */
-# define VM_PKEY_BIT2	VM_HIGH_ARCH_2
-# define VM_PKEY_BIT3	VM_HIGH_ARCH_3
-#ifdef CONFIG_PPC
+# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
+# define VM_PKEY_BIT0  VM_HIGH_ARCH_0
+# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
+# define VM_PKEY_BIT2  VM_HIGH_ARCH_2
+#if CONFIG_ARCH_PKEY_BITS > 3
+# define VM_PKEY_BIT3  VM_HIGH_ARCH_3
+#else
+# define VM_PKEY_BIT3  0
+#endif
+#if CONFIG_ARCH_PKEY_BITS > 4
 # define VM_PKEY_BIT4  VM_HIGH_ARCH_4
 #else
 # define VM_PKEY_BIT4  0
-- 
cgit v1.2.3


From facaa1373c9aabf8e34109a9cb205ad0f3a8584e Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Thu, 22 Aug 2024 16:10:55 +0100
Subject: arm64: re-order MTE VM_ flags

VM_PKEY_BIT[012] will use VM_HIGH_ARCH_[012], move the MTE VM flags to
accommodate this.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20240822151113.1479789-13-joey.gouly@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fb6ccd93f589..406512c16471 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -378,8 +378,8 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 
 #if defined(CONFIG_ARM64_MTE)
-# define VM_MTE		VM_HIGH_ARCH_0	/* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED	VM_HIGH_ARCH_1	/* Tagged memory permitted */
+# define VM_MTE		VM_HIGH_ARCH_4	/* Use Tagged memory for access control */
+# define VM_MTE_ALLOWED	VM_HIGH_ARCH_5	/* Tagged memory permitted */
 #else
 # define VM_MTE		VM_NONE
 # define VM_MTE_ALLOWED	VM_NONE
-- 
cgit v1.2.3


From c6fb88a5270f4ba24859f5ecb61cfc731ef16300 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Wed, 4 Sep 2024 21:51:50 +0900
Subject: firewire: core: allocate workqueue to handle isochronous contexts in
 card

This commit adds a workqueue dedicated for isochronous context processing.

The workqueue is allocated per instance of fw_card structure to satisfy the
following characteristics descending from 1394 OHCI specification:

In 1394 OHCI specification, memory pages are reserved to each isochronous
context dedicated to DMA transmission. It allows to operate these
per-context pages concurrently. Software can schedule hardware interrupt
for several isochronous context to the same cycle, thus WQ_UNBOUND is
specified. Additionally, it is sleepable to operate the content of pages,
thus WQ_BH is not used.

The isochronous context delivers the packets with time stamp, thus
WQ_HIGHPRI is specified for semi real-time data such as IEC 61883-1/6
protocol implemented by ALSA firewire stack. The isochronous context is not
used by the implementation of SCSI over IEEE1394 protocol (sbp2), thus
WQ_MEM_RECLAIM is not specified.

It is useful for users to adjust cpu affinity of the workqueue depending
on their work loads, thus WQ_SYS is specified to expose the attributes to
user space.

Tested-by: Edmund Raile <edmund.raile@protonmail.com>
Link: https://lore.kernel.org/r/20240904125155.461886-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 1cca14cf5652..10e135d60824 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -134,6 +134,8 @@ struct fw_card {
 	__be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
 
 	__be32 maint_utility_register;
+
+	struct workqueue_struct *isoc_wq;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
-- 
cgit v1.2.3


From 4f55ad754d6b7b6fa4ebcfd8c50f7e53e661a78e Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Wed, 4 Sep 2024 21:51:51 +0900
Subject: firewire: core: add local API to queue work item to workqueue
 specific to isochronous contexts

In the previous commit, the workqueue is added per the instance of fw_card
structure for isochronous contexts. The workqueue is designed to be used by
the implementation of fw_card_driver structure underlying the fw_card.

This commit adds some local APIs to be used by the implementation.

Tested-by: Edmund Raile <edmund.raile@protonmail.com>
Link: https://lore.kernel.org/r/20240904125155.461886-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 10e135d60824..72f497b61739 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -511,6 +511,7 @@ union fw_iso_callback {
 
 struct fw_iso_context {
 	struct fw_card *card;
+	struct work_struct work;
 	int type;
 	int channel;
 	int speed;
-- 
cgit v1.2.3


From bd07d864522e7c3e4ee364e91aee8754992f5855 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 4 Sep 2024 14:11:20 +0206
Subject: printk: nbcon: Add function for printers to reacquire ownership

Since ownership can be lost at any time due to handover or
takeover, a printing context _must_ be prepared to back out
immediately and carefully. However, there are scenarios where
the printing context must reacquire ownership in order to
finalize or revert hardware changes.

One such example is when interrupts are disabled during
printing. No other context will automagically re-enable the
interrupts. For this case, the disabling context _must_
reacquire nbcon ownership so that it can re-enable the
interrupts.

Provide nbcon_reacquire_nobuf() for exactly this purpose. It
allows a printing context to reacquire ownership using the same
priority as its previous ownership.

Note that after a successful reacquire the printing context
will have no output buffer because that has been lost. This
function cannot be used to resume printing.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240904120536.115780-2-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 9a13f91b0c43..88050d30a9cc 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -366,6 +366,10 @@ struct console {
 	 *
 	 * The callback should allow the takeover whenever it is safe. It
 	 * increases the chance to see messages when the system is in trouble.
+	 * If the driver must reacquire ownership in order to finalize or
+	 * revert hardware changes, nbcon_reacquire_nobuf() can be used.
+	 * However, on reacquire the buffer content is no longer available. A
+	 * reacquire cannot be used to resume printing.
 	 *
 	 * The callback can be called from any context (including NMI).
 	 * Therefore it must avoid usage of any locking and instead rely
@@ -558,12 +562,14 @@ extern void nbcon_cpu_emergency_exit(void);
 extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
+extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
 #else
 static inline void nbcon_cpu_emergency_enter(void) { }
 static inline void nbcon_cpu_emergency_exit(void) { }
 static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
+static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
 #endif
 
 extern int console_set_on_cmdline;
-- 
cgit v1.2.3


From 76f258bf3f2aae570209319944703a92ac64e29e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Sep 2024 14:11:25 +0206
Subject: printk: nbcon: Introduce printer kthreads

Provide the main implementation for running a printer kthread
per nbcon console that is takeover/handover aware. This
includes:

- new mandatory write_thread() callback
- kthread creation
- kthread main printing loop
- kthread wakeup mechanism
- kthread shutdown

kthread creation is a bit tricky because consoles may register
before kthreads can be created. In such cases, registration
will succeed, even though no kthread exists. Once kthreads can
be created, an early_initcall will set @printk_kthreads_ready.
If there are no registered boot consoles, the early_initcall
creates the kthreads for all registered nbcon consoles. If
kthread creation fails, the related console is unregistered.

If there are registered boot consoles when
@printk_kthreads_ready is set, no kthreads are created until
the final boot console unregisters.

Once kthread creation finally occurs, @printk_kthreads_running
is set so that the system knows kthreads are available for all
registered nbcon consoles.

If @printk_kthreads_running is already set when the console
is registering, the kthread is created during registration. If
kthread creation fails, the registration will fail.

Until @printk_kthreads_running is set, console printing occurs
directly via the console_lock.

kthread shutdown on system shutdown/reboot is necessary to
ensure the printer kthreads finish their printing so that the
system can cleanly transition back to direct printing via the
console_lock in order to reliably push out the final
shutdown/reboot messages. @printk_kthreads_running is cleared
before shutting down the individual kthreads.

The kthread uses a new mandatory write_thread() callback that
is called with both device_lock() and the console context
acquired.

The console ownership handling is necessary for synchronization
against write_atomic() which is synchronized only via the
console context ownership.

The device_lock() serializes acquiring the console context with
NBCON_PRIO_NORMAL. It is needed in case the device_lock() does
not disable preemption. It prevents the following race:

CPU0				CPU1

 [ task A ]

 nbcon_context_try_acquire()
   # success with NORMAL prio
   # .unsafe == false;  // safe for takeover

 [ schedule: task A -> B ]

				WARN_ON()
				  nbcon_atomic_flush_pending()
				    nbcon_context_try_acquire()
				      # success with EMERGENCY prio

				      # flushing
				      nbcon_context_release()

				      # HERE: con->nbcon_state is free
				      #       to take by anyone !!!

 nbcon_context_try_acquire()
   # success with NORMAL prio [ task B ]

 [ schedule: task B -> A ]

 nbcon_enter_unsafe()
   nbcon_context_can_proceed()

BUG: nbcon_context_can_proceed() returns "true" because
     the console is owned by a context on CPU0 with
     NBCON_PRIO_NORMAL.

     But it should return "false". The console is owned
     by a context from task B and we do the check
     in a context from task A.

Note that with these changes, the printer kthreads do not yet
take over full responsibility for nbcon printing during normal
operation. These changes only focus on the lifecycle of the
kthreads.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240904120536.115780-7-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 88050d30a9cc..788ce9c829f6 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -16,7 +16,9 @@
 
 #include <linux/atomic.h>
 #include <linux/bits.h>
+#include <linux/irq_work.h>
 #include <linux/rculist.h>
+#include <linux/rcuwait.h>
 #include <linux/types.h>
 #include <linux/vesa.h>
 
@@ -324,6 +326,9 @@ struct nbcon_write_context {
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @nbcon_device_ctxt:	Context available for non-printing operations
  * @pbufs:		Pointer to nbcon private buffer
+ * @kthread:		Printer kthread for this console
+ * @rcuwait:		RCU-safe wait object for @kthread waking
+ * @irq_work:		Defer @kthread waking to IRQ work context
  */
 struct console {
 	char			name[16];
@@ -377,6 +382,37 @@ struct console {
 	 */
 	void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt);
 
+	/**
+	 * @write_thread:
+	 *
+	 * NBCON callback to write out text in task context.
+	 *
+	 * This callback must be called only in task context with both
+	 * device_lock() and the nbcon console acquired with
+	 * NBCON_PRIO_NORMAL.
+	 *
+	 * The same rules for console ownership verification and unsafe
+	 * sections handling applies as with write_atomic().
+	 *
+	 * The console ownership handling is necessary for synchronization
+	 * against write_atomic() which is synchronized only via the context.
+	 *
+	 * The device_lock() provides the primary serialization for operations
+	 * on the device. It might be as relaxed (mutex)[*] or as tight
+	 * (disabled preemption and interrupts) as needed. It allows
+	 * the kthread to operate in the least restrictive mode[**].
+	 *
+	 * [*] Standalone nbcon_context_try_acquire() is not safe with
+	 *     the preemption enabled, see nbcon_owner_matches(). But it
+	 *     can be safe when always called in the preemptive context
+	 *     under the device_lock().
+	 *
+	 * [**] The device_lock() makes sure that nbcon_context_try_acquire()
+	 *      would never need to spin which is important especially with
+	 *      PREEMPT_RT.
+	 */
+	void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt);
+
 	/**
 	 * @device_lock:
 	 *
@@ -423,7 +459,11 @@ struct console {
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct nbcon_context	__private nbcon_device_ctxt;
+
 	struct printk_buffers	*pbufs;
+	struct task_struct	*kthread;
+	struct rcuwait		rcuwait;
+	struct irq_work		irq_work;
 };
 
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From 5102981d5e2a5a7a12424b5d26c7524ac2899898 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 4 Sep 2024 14:11:30 +0206
Subject: printk: nbcon: Show replay message on takeover

An emergency or panic context can takeover console ownership
while the current owner was printing a printk message. The
atomic printer will re-print the message that the previous
owner was printing. However, this can look confusing to the
user and may even seem as though a message was lost.

  [3430014.1
  [3430014.181123] usb 1-2: Product: USB Audio

Add a new field @nbcon_prev_seq to struct console to track
the sequence number to print that was assigned to the previous
console owner. If this matches the sequence number to print
that the current owner is assigned, then a takeover must have
occurred. In this case, print an additional message to inform
the user that the previous message is being printed again.

  [3430014.1
  ** replaying previous printk message **
  [3430014.181123] usb 1-2: Product: USB Audio

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240904120536.115780-12-john.ogness@linutronix.de
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/console.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/console.h b/include/linux/console.h
index 788ce9c829f6..eba367bf605d 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -325,6 +325,7 @@ struct nbcon_write_context {
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @nbcon_device_ctxt:	Context available for non-printing operations
+ * @nbcon_prev_seq:	Seq num the previous nbcon owner was assigned to print
  * @pbufs:		Pointer to nbcon private buffer
  * @kthread:		Printer kthread for this console
  * @rcuwait:		RCU-safe wait object for @kthread waking
@@ -459,6 +460,7 @@ struct console {
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct nbcon_context	__private nbcon_device_ctxt;
+	atomic_long_t           __private nbcon_prev_seq;
 
 	struct printk_buffers	*pbufs;
 	struct task_struct	*kthread;
-- 
cgit v1.2.3


From a4d398a573d0f0c98c39d4af1866a3edaec4959c Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Mon, 2 Sep 2024 07:43:23 +0100
Subject: ARM: 9416/1: amba: make amba_bustype constant

Since commit d492cc2573a0 ("driver core: device.h: make struct
bus_type a const *"), the driver core can properly handle constant
struct bus_type, move the amba_bustype variable to be a constant
structure as well, placing it into read-only memory which can not be
modified at runtime.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 include/linux/amba/bus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 958a55bcc708..dda2f3ea89cb 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -105,7 +105,7 @@ enum amba_vendor {
 	AMBA_VENDOR_LSI = 0xb6,
 };
 
-extern struct bus_type amba_bustype;
+extern const struct bus_type amba_bustype;
 
 #define to_amba_device(d)	container_of_const(d, struct amba_device, dev)
 
-- 
cgit v1.2.3


From 2097154a10c6ee78be8796411e5d0ad81ee06ed6 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Tue, 3 Sep 2024 17:16:12 +0200
Subject: namespace: introduce SB_I_NOIDMAP flag

Right now we determine if filesystem support vfs idmappings or not basing
on the FS_ALLOW_IDMAP flag presence. This "static" way works perfecly well
for local filesystems like ext4, xfs, btrfs, etc. But for network-like
filesystems like fuse, cephfs this approach is not ideal, because sometimes
proper support of vfs idmaps requires some extensions for the on-wire
protocol, which implies that changes have to be made not only in the Linux
kernel code but also in the 3rd party components like libfuse, cephfs MDS
server and so on.

We have seen that issue during our work on cephfs idmapped mounts [1] with
Christian, but right now I'm working on the idmapped mounts support for
fuse/virtiofs and I think that it is a right time for this extension.

[1] 5ccd8530dd7 ("ceph: handle idmapped mounts in create_request_message()")

Suggested-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..6ff547ef21f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1189,6 +1189,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
 #define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
 #define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
+#define SB_I_NOIDMAP	0x00002000	/* No idmapped mounts on this superblock */
 
 /* Possible states of 'frozen' field */
 enum {
-- 
cgit v1.2.3


From 940ce73bdec5a020fec058ba947d2bf627462c53 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 4 Sep 2024 11:08:44 -0700
Subject: bpf: Remove the insn_buf array stack usage from the inline_bpf_loop()

This patch removes the insn_buf array stack usage from the
inline_bpf_loop(). Instead, the env->insn_buf is used. The
usage in inline_bpf_loop() needs more than 16 insn, so the
INSN_BUF_SIZE needs to be increased from 16 to 32.
The compiler stack size warning on the verifier is gone
after this change.

Cc: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20240904180847.56947-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 2e20207315a9..8458632824a4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -24,7 +24,7 @@
  */
 #define TMP_STR_BUF_LEN 320
 /* Patch buffer size */
-#define INSN_BUF_SIZE 16
+#define INSN_BUF_SIZE 32
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
-- 
cgit v1.2.3


From a8532fac7b5d27b8d62008a89593dccb6f9786ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 30 Aug 2024 22:02:34 -1000
Subject: sched_ext: TASK_DEAD tasks must be switched into SCX on ops_enable

During scx_ops_enable(), SCX needs to invoke the sleepable ops.init_task()
on every task. To do this, it does get_task_struct() on each iterated task,
drop the lock and then call ops.init_task().

However, a TASK_DEAD task may already have lost all its usage count and be
waiting for RCU grace period to be freed. If get_task_struct() is called on
such task, use-after-free can happen. To avoid such situations,
scx_ops_enable() skips initialization of TASK_DEAD tasks, which seems safe
as they are never going to be scheduled again.

Unfortunately, a racing sched_setscheduler(2) can grab the task before the
task is unhashed and then continue to e.g. move the task from RT to SCX
after TASK_DEAD is set and ops_enable skipped the task. As the task hasn't
gone through scx_ops_init_task(), scx_ops_enable_task() called from
switching_to_scx() triggers the following warning:

  sched_ext: Invalid task state transition 0 -> 3 for stress-ng-race-[2872]
  WARNING: CPU: 6 PID: 2367 at kernel/sched/ext.c:3327 scx_ops_enable_task+0x18f/0x1f0
  ...
  RIP: 0010:scx_ops_enable_task+0x18f/0x1f0
  ...
   switching_to_scx+0x13/0xa0
   __sched_setscheduler+0x84e/0xa50
   do_sched_setscheduler+0x104/0x1c0
   __x64_sys_sched_setscheduler+0x18/0x30
   do_syscall_64+0x7b/0x140
   entry_SYSCALL_64_after_hwframe+0x76/0x7e

As in the ops_disable path, it just doesn't seem like a good idea to leave
any task in an inconsistent state, even when the task is dead. The root
cause is ops_enable not being able to tell reliably whether a task is truly
dead (no one else is looking at it and it's about to be freed) and was
testing TASK_DEAD instead. Fix it by testing the task's usage count
directly.

- ops_init no longer ignores TASK_DEAD tasks. As now all users iterate all
  tasks, @include_dead is removed from scx_task_iter_next_locked() along
  with dead task filtering.

- tryget_task_struct() is added. Tasks are skipped iff tryget_task_struct()
  fails.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/sched/task.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 4df2f9055587..0f2aeb37bbb0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -120,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
 	return t;
 }
 
+static inline struct task_struct *tryget_task_struct(struct task_struct *t)
+{
+	return refcount_inc_not_zero(&t->usage) ? t : NULL;
+}
+
 extern void __put_task_struct(struct task_struct *t);
 extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
 
-- 
cgit v1.2.3


From 8195136669661fdfe54e9a8923c33b31c92fc1da Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 4 Sep 2024 10:24:59 -1000
Subject: sched_ext: Add cgroup support

Add sched_ext_ops operations to init/exit cgroups, and track task migrations
and config changes. A BPF scheduler may not implement or implement only
subset of cgroup features. The implemented features can be indicated using
%SCX_OPS_HAS_CGOUP_* flags. If cgroup configuration makes use of features
that are not implemented, a warning is triggered.

While a BPF scheduler is being enabled and disabled, relevant cgroup
operations are locked out using scx_cgroup_rwsem. This avoids situations
like task prep taking place while the task is being moved across cgroups,
making things easier for BPF schedulers.

v7: - cgroup interface file visibility toggling is dropped in favor just
      warning messages. Dynamically changing interface visiblity caused more
      confusion than helping.

v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE.

    - Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for
      !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED.

v5: - Flipped the locking order between scx_cgroup_rwsem and
      cpus_read_lock() to avoid locking order conflict w/ cpuset. Better
      documentation around locking.

    - sched_move_task() takes an early exit if the source and destination
      are identical. This triggered the warning in scx_cgroup_can_attach()
      as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup
      migration path so that ops.cgroup_prep_move() is skipped for identity
      migrations so that its invocations always match ops.cgroup_move()
      one-to-one.

v4: - Example schedulers moved into their own patches.

    - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.

v3: - Make scx_example_pair switch all tasks by default.

    - Convert to BPF inline iterators.

    - scx_bpf_task_cgroup() is added to determine the current cgroup from
      CPU controller's POV. This allows BPF schedulers to accurately track
      CPU cgroup membership.

    - scx_example_flatcg added. This demonstrates flattened hierarchy
      implementation of CPU cgroup control and shows significant performance
      improvement when cgroups which are nested multiple levels are under
      competition.

v2: - Build fixes for different CONFIG combinations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
---
 include/linux/sched/ext.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index db2a266113ac..dc646cca4fcf 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -188,6 +188,9 @@ struct sched_ext_entity {
 	bool			disallow;	/* reject switching into SCX */
 
 	/* cold fields */
+#ifdef CONFIG_EXT_GROUP_SCHED
+	struct cgroup		*cgrp_moving_from;
+#endif
 	/* must be the last field, see init_scx_entity() */
 	struct list_head	tasks_node;
 };
-- 
cgit v1.2.3


From 4e893545ef8712d25f3176790ebb95beb073637e Mon Sep 17 00:00:00 2001
From: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Date: Wed, 4 Sep 2024 12:48:47 +0200
Subject: PCI/NPEM: Add Native PCIe Enclosure Management support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Native PCIe Enclosure Management (NPEM, PCIe r6.1 sec 6.28) allows managing
LEDs in storage enclosures. NPEM is indication oriented and it does not
give direct access to LEDs. Although each indication *could* represent an
individual LED, multiple indications could also be represented as a single,
multi-color LED or a single LED blinking in a specific interval.  The
specification leaves that open.

Each enabled indication (capability register bit on) is represented as a
ledclass_dev which can be controlled through sysfs. For every ledclass
device only 2 brightness states are allowed: LED_ON (1) or LED_OFF (0).
This corresponds to the NPEM control register (Indication bit on/off).

Ledclass devices appear in sysfs as child devices (subdirectory) of PCI
device which has an NPEM Extended Capability and indication is enabled in
NPEM capability register. For example, these are LEDs created for pcieport
"10000:02:05.0" on my setup:

  leds/
  ├── 10000:02:05.0:enclosure:fail
  ├── 10000:02:05.0:enclosure:locate
  ├── 10000:02:05.0:enclosure:ok
  └── 10000:02:05.0:enclosure:rebuild

They can be also found in "/sys/class/leds" directory. The parent PCIe
device domain/bus/device/function address is used to guarantee uniqueness
across leds subsystem.

To enable/disable a "fail" indication, the "brightness" file can be edited:

  echo 1 > ./leds/10000:02:05.0:enclosure:fail/brightness
  echo 0 > ./leds/10000:02:05.0:enclosure:fail/brightness

PCIe r6.1, sec 7.9.19.2 defines the possible indications.

Multiple indications for same parent PCIe device can conflict and hardware
may update them when processing new request. To avoid issues, driver
refresh all indications by reading back control register.

This driver expects to be the exclusive NPEM extended capability manager.
It waits up to 1 second after imposing new request, it doesn't verify if
controller is busy before write, and it assumes the mutex lock gives
protection from concurrent updates.

If _DSM LED management is available, we assume the platform may be using
NPEM for its own purposes (see PCI Firmware Spec r3.3 sec 4.7), so the
driver does not use NPEM. A future patch will add _DSM support; an info
message notes whether NPEM or _DSM is being used.

NPEM is a PCIe extended capability so it should be registered in
pcie_init_capabilities() but it is not possible due to LED dependency.  The
parent pci_device must be added earlier for led_classdev_register() to be
successful. NPEM does not require configuration on kernel side, so it is
safe to register LED devices later.

Link: https://lore.kernel.org/r/20240904104848.23480-3-mariusz.tkaczyk@linux.intel.com
Suggested-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Stuart Hayes <stuart.w.hayes@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/pci.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..c9db853e269f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -516,6 +516,9 @@ struct pci_dev {
 #endif
 #ifdef CONFIG_PCI_DOE
 	struct xarray	doe_mbs;	/* Data Object Exchange mailboxes */
+#endif
+#ifdef CONFIG_PCI_NPEM
+	struct npem	*npem;		/* Native PCIe Enclosure Management */
 #endif
 	u16		acs_cap;	/* ACS Capability offset */
 	phys_addr_t	rom;		/* Physical address if not from BAR */
-- 
cgit v1.2.3


From 73425800ac948cb78063b897a0c345d788f3594d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Jun 2024 11:26:24 +0100
Subject: netfs, cifs: Move CIFS_INO_MODIFIED_ATTR to netfs_inode

Move CIFS_INO_MODIFIED_ATTR to netfs_inode as NETFS_ICTX_MODIFIED_ATTR and
then make netfs_perform_write() set it.  This means that cifs doesn't need
to implement the ->post_modify() hook.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.com>
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-7-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index c47443e7a97e..71c748c53a1f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -73,6 +73,7 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
+#define NETFS_ICTX_MODIFIED_ATTR 3		/* Indicate change in mtime/ctime */
 };
 
 /*
-- 
cgit v1.2.3


From 52d55922e0f1db1f580c9f91c174d2392bfad481 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 7 Jun 2024 09:02:58 +0100
Subject: netfs: Move max_len/max_nr_segs from netfs_io_subrequest to
 netfs_io_stream

Move max_len/max_nr_segs from struct netfs_io_subrequest to struct
netfs_io_stream as we only issue one subreq at a time and then don't need
these values again for that subreq unless and until we have to retry it -
in which case we want to renegotiate them.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-8-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 71c748c53a1f..eea5ea2842f6 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -134,6 +134,8 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)
 struct netfs_io_stream {
 	/* Submission tracking */
 	struct netfs_io_subrequest *construct;	/* Op being constructed */
+	size_t			sreq_max_len;	/* Maximum size of a subrequest */
+	unsigned int		sreq_max_segs;	/* 0 or max number of segments in an iterator */
 	unsigned int		submit_off;	/* Folio offset we're submitting from */
 	unsigned int		submit_len;	/* Amount of data left to submit */
 	unsigned int		submit_max_len;	/* Amount I/O can be rounded up to */
@@ -177,14 +179,12 @@ struct netfs_io_subrequest {
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	unsigned long long	start;		/* Where to start the I/O */
-	size_t			max_len;	/* Maximum size of the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
 	refcount_t		ref;
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
 	unsigned int		nr_segs;	/* Number of segs in io_iter */
-	unsigned int		max_nr_segs;	/* 0 or max number of segments in an iterator */
 	enum netfs_io_source	source;		/* Where to read from/write to */
 	unsigned char		stream_nr;	/* I/O stream this belongs to */
 	unsigned long		flags;
-- 
cgit v1.2.3


From 51e9a86a4f75c6dc8531407b2ab2ccc4ad137e5a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 9 Jul 2024 09:45:46 +0100
Subject: netfs: Reserve netfs_sreq_source 0 as unset/unknown

Reserve the 0-valued netfs_sreq_source to mean unset or unknown so that it
can be seen in the trace as such rather than appearing as
download-from-server when it's going to get switched to something else.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-9-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index eea5ea2842f6..37f8ce9c0284 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -43,6 +43,7 @@ static inline void folio_start_private_2(struct folio *folio)
 #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1	/* - Page needs wb/dirty flag wrangling */
 
 enum netfs_io_source {
+	NETFS_SOURCE_UNKNOWN,
 	NETFS_FILL_WITH_ZEROES,
 	NETFS_DOWNLOAD_FROM_SERVER,
 	NETFS_READ_FROM_CACHE,
-- 
cgit v1.2.3


From c57de2a9259d7dc18a7a425fca91c77502263d8a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 9 Jul 2024 09:43:38 +0100
Subject: netfs: Remove NETFS_COPY_TO_CACHE

Remove NETFS_COPY_TO_CACHE as it isn't used anymore.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-10-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 37f8ce9c0284..63ab2c775a21 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -207,11 +207,10 @@ enum netfs_io_origin {
 	NETFS_READAHEAD,		/* This read was triggered by readahead */
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
-	NETFS_COPY_TO_CACHE,		/* This write is to copy a read to the cache */
+	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
 	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
-	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
 	nr__netfs_io_origin
 } __mode(byte);
-- 
cgit v1.2.3


From f9ecc2febf6fd6ad53208a1c0e1b5066ee65dd8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@baylibre.com>
Date: Fri, 12 Jul 2024 19:18:20 +0200
Subject: pwm: Don't export pwm_capture()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is only a single caller of this function, and that's in
drivers/pwm/core.c itself. So don't export the function.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/20240712171821.1470833-2-u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 include/linux/pwm.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f8c2dc12dbd3..8acd60b53f58 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -394,9 +394,6 @@ static inline bool pwm_might_sleep(struct pwm_device *pwm)
 }
 
 /* PWM provider APIs */
-int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
-		unsigned long timeout);
-
 void pwmchip_put(struct pwm_chip *chip);
 struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv);
 struct pwm_chip *devm_pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv);
@@ -462,13 +459,6 @@ static inline void pwm_disable(struct pwm_device *pwm)
 	might_sleep();
 }
 
-static inline int pwm_capture(struct pwm_device *pwm,
-			      struct pwm_capture *result,
-			      unsigned long timeout)
-{
-	return -EINVAL;
-}
-
 static inline void pwmchip_put(struct pwm_chip *chip)
 {
 }
-- 
cgit v1.2.3


From 4356d575ef0f39a3e8e0ce0c40d84ce900ac3b61 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 28 Aug 2024 20:19:43 +1000
Subject: fhandle: expose u64 mount id to name_to_handle_at(2)

Now that we provide a unique 64-bit mount ID interface in statx(2), we
can now provide a race-free way for name_to_handle_at(2) to provide a
file handle and corresponding mount without needing to worry about
racing with /proc/mountinfo parsing or having to open a file just to do
statx(2).

While this is not necessary if you are using AT_EMPTY_PATH and don't
care about an extra statx(2) call, users that pass full paths into
name_to_handle_at(2) need to know which mount the file handle comes from
(to make sure they don't try to open_by_handle_at a file handle from a
different filesystem) and switching to AT_EMPTY_PATH would require
allocating a file for every name_to_handle_at(2) call, turning

  err = name_to_handle_at(-EBADF, "/foo/bar/baz", &handle, &mntid,
                          AT_HANDLE_MNT_ID_UNIQUE);

into

  int fd = openat(-EBADF, "/foo/bar/baz", O_PATH | O_CLOEXEC);
  err1 = name_to_handle_at(fd, "", &handle, &unused_mntid, AT_EMPTY_PATH);
  err2 = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &statxbuf);
  mntid = statxbuf.stx_mnt_id;
  close(fd);

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20240828-exportfs-u64-mount-id-v3-2-10c2c4c16708@cyphar.com
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4bcf6754738d..5758104921e6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -870,7 +870,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 #endif
 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
 				      struct file_handle __user *handle,
-				      int __user *mnt_id, int flag);
+				      void __user *mnt_id, int flag);
 asmlinkage long sys_open_by_handle_at(int mountdirfd,
 				      struct file_handle __user *handle,
 				      int flags);
-- 
cgit v1.2.3


From 19156263cb1f24128a9ba6ef7340be5cbacc3d22 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 5 Sep 2024 10:14:05 +0300
Subject: dma-mapping: use IOMMU DMA calls for common alloc/free page calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Common alloca and free pages routines are called when IOMMU DMA is used,
and internally it calls to DMA ops structure which is not available for
default IOMMU. This patch adds necessary if checks to call IOMMU DMA.

It fixes the following crash:

 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000040
 Mem abort info:
   ESR = 0x0000000096000006
   EC = 0x25: DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
   FSC = 0x06: level 2 translation fault
 Data abort info:
   ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000
   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000d20bb000
 [0000000000000040] pgd=08000000d20c1003
 , p4d=08000000d20c1003
 , pud=08000000d20c2003, pmd=0000000000000000
 Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
 Modules linked in: ipv6 hci_uart venus_core btqca
v4l2_mem2mem btrtl qcom_spmi_adc5 sbs_battery btbcm qcom_vadc_common
cros_ec_typec videobuf2_v4l2 leds_cros_ec cros_kbd_led_backlight
cros_ec_chardev videodev elan_i2c
videobuf2_common qcom_stats mc bluetooth coresight_stm stm_core
ecdh_generic ecc pwrseq_core panel_edp icc_bwmon ath10k_snoc ath10k_core
ath mac80211 phy_qcom_qmp_combo aux_bridge libarc4 coresight_replicator
coresight_etm4x coresight_tmc
coresight_funnel cfg80211 rfkill coresight qcom_wdt cbmem ramoops
reed_solomon pwm_bl coreboot_table backlight crct10dif_ce
 CPU: 7 UID: 0 PID: 70 Comm: kworker/u32:4 Not tainted 6.11.0-rc6-next-20240903-00003-gdfc6015d0711 #660
 Hardware name: Google Lazor Limozeen without Touchscreen (rev5 - rev8) (DT)
 Workqueue: events_unbound deferred_probe_work_func
 hub 2-1:1.0: 4 ports detected

 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
 pc : dma_common_alloc_pages+0x54/0x1b4
 lr : dma_common_alloc_pages+0x4c/0x1b4
 sp : ffff8000807d3730
 x29: ffff8000807d3730 x28: ffff02a7d312f880 x27: 0000000000000001
 x26: 000000000000c000 x25: 0000000000000000 x24: 0000000000000001
 x23: ffff02a7d23b6898 x22: 0000000000006cc0 x21: 000000000000c000
 x20: ffff02a7858bf410 x19: fffffe0a60006000 x18: 0000000000000001
 x17: 00000000000000d5 x16: 1fffe054f0bcc261 x15: 0000000000000001
 x14: ffff02a7844dc680 x13: 0000000000100180 x12: dead000000000100
 x11: dead000000000122 x10: 00000000001001ff x9 : ffff02a87f7b7b00
 x8 : ffff02a87f7b7b00 x7 : ffff405977d6b000 x6 : ffff8000807d3310
 x5 : ffff02a87f6b6398 x4 : 0000000000000001 x3 : ffff405977d6b000
 x2 : ffff02a7844dc600 x1 : 0000000100000000 x0 : fffffe0a60006000
 Call trace:
  dma_common_alloc_pages+0x54/0x1b4
  __dma_alloc_pages+0x68/0x90
  dma_alloc_pages+0x10/0x1c
  snd_dma_noncoherent_alloc+0x28/0x8c
  __snd_dma_alloc_pages+0x30/0x50
  snd_dma_alloc_dir_pages+0x40/0x80
  do_alloc_pages+0xb8/0x13c
  preallocate_pcm_pages+0x6c/0xf8
  preallocate_pages+0x160/0x1a4
  snd_pcm_set_managed_buffer_all+0x64/0xb0
  lpass_platform_pcm_new+0xc0/0xe8
  snd_soc_pcm_component_new+0x3c/0xc8
  soc_new_pcm+0x4fc/0x668
  snd_soc_bind_card+0xabc/0xbac
  snd_soc_register_card+0xf0/0x108
  devm_snd_soc_register_card+0x4c/0xa4
  sc7180_snd_platform_probe+0x180/0x224
  platform_probe+0x68/0xc0
  really_probe+0xbc/0x298
  __driver_probe_device+0x78/0x12c
  driver_probe_device+0x3c/0x15c
  __device_attach_driver+0xb8/0x134
  bus_for_each_drv+0x84/0xe0
  __device_attach+0x9c/0x188
  device_initial_probe+0x14/0x20
  bus_probe_device+0xac/0xb0
  deferred_probe_work_func+0x88/0xc0
  process_one_work+0x14c/0x28c
  worker_thread+0x2cc/0x3d4
  kthread+0x114/0x118
  ret_from_fork+0x10/0x20
 Code: f9411c19 940000c9 aa0003f3 b4000460 (f9402326)
 ---[ end trace 0000000000000000 ]---

Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu")
Closes: https://lore.kernel.org/all/10431dfd-ce04-4e0f-973b-c78477303c18@notapiano
Reported-by: Nícolas F. R. A. Prado <nfraprado@collabora.com> #KernelCI
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Tested-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/iommu-dma.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h
index d30a58bf00fd..1bb55ca1ab79 100644
--- a/include/linux/iommu-dma.h
+++ b/include/linux/iommu-dma.h
@@ -10,6 +10,10 @@
 #include <linux/dma-direction.h>
 
 #ifdef CONFIG_IOMMU_DMA
+static inline bool use_dma_iommu(struct device *dev)
+{
+	return dev->dma_iommu;
+}
 dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs);
@@ -49,6 +53,10 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
 void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 		int nelems, enum dma_data_direction dir);
 #else
+static inline bool use_dma_iommu(struct device *dev)
+{
+	return false;
+}
 static inline dma_addr_t iommu_dma_map_page(struct device *dev,
 		struct page *page, unsigned long offset, size_t size,
 		enum dma_data_direction dir, unsigned long attrs)
-- 
cgit v1.2.3


From 59da880afed211c989ef65da577b24215ce57774 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 3 Sep 2024 10:45:58 -0700
Subject: uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks

It serves no purpose beyond adding unnecessray argument passed to the
filter callback. Just get rid of it, no one is actually using it.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240903174603.3554182-4-andrii@kernel.org
---
 include/linux/uprobes.h | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f50df1fa93e7..63ae2ade3487 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -28,20 +28,12 @@ struct page;
 
 #define MAX_URETPROBE_DEPTH		64
 
-enum uprobe_filter_ctx {
-	UPROBE_FILTER_REGISTER,
-	UPROBE_FILTER_UNREGISTER,
-	UPROBE_FILTER_MMAP,
-};
-
 struct uprobe_consumer {
 	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
 	int (*ret_handler)(struct uprobe_consumer *self,
 				unsigned long func,
 				struct pt_regs *regs);
-	bool (*filter)(struct uprobe_consumer *self,
-				enum uprobe_filter_ctx ctx,
-				struct mm_struct *mm);
+	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
 	struct uprobe_consumer *next;
 };
-- 
cgit v1.2.3


From cc01bd044e6a521d2cd128f685ee8d23ef0067f2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 3 Sep 2024 10:45:59 -0700
Subject: uprobes: travers uprobe's consumer list locklessly under SRCU
 protection

uprobe->register_rwsem is one of a few big bottlenecks to scalability of
uprobes, so we need to get rid of it to improve uprobe performance and
multi-CPU scalability.

First, we turn uprobe's consumer list to a typical doubly-linked list
and utilize existing RCU-aware helpers for traversing such lists, as
well as adding and removing elements from it.

For entry uprobes we already have SRCU protection active since before
uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe
won't go away from under us, but we add SRCU protection around consumer
list traversal.

Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple,
we remember whether any removal was requested during handler calls, but
then we double-check the decision under a proper register_rwsem using
consumers' filter callbacks. Handler removal is very rare, so this extra
lock won't hurt performance, overall, but we also avoid the need for any
extra protection (e.g., seqcount locks).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240903174603.3554182-5-andrii@kernel.org
---
 include/linux/uprobes.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 63ae2ade3487..f112b56e9479 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -29,13 +29,21 @@ struct page;
 #define MAX_URETPROBE_DEPTH		64
 
 struct uprobe_consumer {
+	/*
+	 * handler() can return UPROBE_HANDLER_REMOVE to signal the need to
+	 * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is
+	 * returned, filter() callback has to be implemented as well and it
+	 * should return false to "confirm" the decision to uninstall uprobe
+	 * for the current process. If filter() is omitted or returns true,
+	 * UPROBE_HANDLER_REMOVE is effectively ignored.
+	 */
 	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
 	int (*ret_handler)(struct uprobe_consumer *self,
 				unsigned long func,
 				struct pt_regs *regs);
 	bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
 
-	struct uprobe_consumer *next;
+	struct list_head cons_node;
 };
 
 #ifdef CONFIG_UPROBES
-- 
cgit v1.2.3


From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 3 Sep 2024 10:46:00 -0700
Subject: perf/uprobe: split uprobe_unregister()

With uprobe_unregister() having grown a synchronize_srcu(), it becomes
fairly slow to call. Esp. since both users of this API call it in a
loop.

Peel off the sync_srcu() and do it once, after the loop.

We also need to add uprobe_unregister_sync() into uprobe_register()'s
error handling path, as we need to be careful about returning to the
caller before we have a guarantee that partially attached consumer won't
be called anymore. This is an unlikely slow path and this should be
totally fine to be slow in the case of a failed attach.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Co-developed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org
---
 include/linux/uprobes.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f112b56e9479..2b294bf1881f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
-extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc);
+extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
+extern void uprobe_unregister_sync(void);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void uprobe_start_dup_mmap(void);
@@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
 	return -ENOSYS;
 }
 static inline void
-uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+}
+static inline void uprobe_unregister_sync(void)
 {
 }
 static inline int uprobe_mmap(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 50a38035ed5ccc2ab8a28eaf70c3c7a87e060345 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 3 Sep 2024 10:46:01 -0700
Subject: rbtree: provide rb_find_rcu() / rb_find_add_rcu()

Much like latch_tree, add two RCU methods for the regular RB-tree,
which can be used in conjunction with a seqcount to provide lockless
lookups.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20240903174603.3554182-7-andrii@kernel.org
---
 include/linux/rbtree.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f7edca369eda..7c173aa64e1e 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -244,6 +244,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
 	return NULL;
 }
 
+/**
+ * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
+ * @node: node to look-for / insert
+ * @tree: tree to search / modify
+ * @cmp: operator defining the node order
+ *
+ * Adds a Store-Release for link_node.
+ *
+ * Returns the rb_node matching @node, or NULL when no match is found and @node
+ * is inserted.
+ */
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+		int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	int c;
+
+	while (*link) {
+		parent = *link;
+		c = cmp(node, parent);
+
+		if (c < 0)
+			link = &parent->rb_left;
+		else if (c > 0)
+			link = &parent->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node_rcu(node, parent, link);
+	rb_insert_color(node, tree);
+	return NULL;
+}
+
 /**
  * rb_find() - find @key in tree @tree
  * @key: key to match
@@ -272,6 +308,37 @@ rb_find(const void *key, const struct rb_root *tree,
 	return NULL;
 }
 
+/**
+ * rb_find_rcu() - find @key in tree @tree
+ * @key: key to match
+ * @tree: tree to search
+ * @cmp: operator defining the node order
+ *
+ * Notably, tree descent vs concurrent tree rotations is unsound and can result
+ * in false-negatives.
+ *
+ * Returns the rb_node matching @key or NULL.
+ */
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+	    int (*cmp)(const void *key, const struct rb_node *))
+{
+	struct rb_node *node = tree->rb_node;
+
+	while (node) {
+		int c = cmp(key, node);
+
+		if (c < 0)
+			node = rcu_dereference_raw(node->rb_left);
+		else if (c > 0)
+			node = rcu_dereference_raw(node->rb_right);
+		else
+			return node;
+	}
+
+	return NULL;
+}
+
 /**
  * rb_find_first() - find the first @key in @tree
  * @key: key to match
-- 
cgit v1.2.3


From e04e2b760ddbe3d7b283a05898c3a029085cd8cd Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sun, 1 Sep 2024 05:10:52 +0200
Subject: platform/x86: wmi: Pass event data directly to legacy notify handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current legacy WMI handlers are susceptible to picking up wrong
WMI event data on systems where different WMI devices share some
notification IDs.

Prevent this by letting the WMI driver core taking care of retrieving
the event data. This also simplifies the legacy WMI handlers and their
implementation inside the WMI driver core.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20240901031055.3030-3-W_Armin@gmx.de
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0687a442fec7..eed105b1fbfb 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -386,7 +386,7 @@ extern bool acpi_is_pnp_device(struct acpi_device *);
 
 #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE)
 
-typedef void (*wmi_notify_handler) (u32 value, void *context);
+typedef void (*wmi_notify_handler) (union acpi_object *data, void *context);
 
 int wmi_instance_count(const char *guid);
 
-- 
cgit v1.2.3


From 79a56f4c8fa629ac79ada8f4b746195bd91f09c7 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sun, 1 Sep 2024 05:10:53 +0200
Subject: platform/x86: wmi: Remove wmi_get_event_data()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the WMI driver core now takes care of retrieving the
WMI event data even for legacy WMI notify handlers, this
function is no longer used.

Remove it to prevent WMI drivers from messing up the ACPI
firmware on some machines.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Link: https://lore.kernel.org/r/20240901031055.3030-4-W_Armin@gmx.de
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/acpi.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index eed105b1fbfb..3cbe4b57bc73 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -401,7 +401,6 @@ extern acpi_status wmi_set_block(const char *guid, u8 instance,
 extern acpi_status wmi_install_notify_handler(const char *guid,
 					wmi_notify_handler handler, void *data);
 extern acpi_status wmi_remove_notify_handler(const char *guid);
-extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out);
 extern bool wmi_has_guid(const char *guid);
 extern char *wmi_get_acpi_device_uid(const char *guid);
 
-- 
cgit v1.2.3


From 3e6a7e3cda773adacc2fa4b9623d0a8c0f904d50 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Tue, 27 Aug 2024 09:59:38 -0700
Subject: iommufd: Reorder struct forward declarations

Reorder struct forward declarations to alphabetic order to simplify
maintenance, as upcoming patches will add more to the list.

No functional change intended.

Link: https://patch.msgid.link/r/c5dd87100f6f01389b838c63237e28c5dd373358.1724776335.git.nicolinc@nvidia.com
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index c2f2f6b9148e..30f832a60ccb 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -11,12 +11,12 @@
 #include <linux/types.h>
 
 struct device;
-struct iommufd_device;
-struct page;
-struct iommufd_ctx;
-struct iommufd_access;
 struct file;
 struct iommu_group;
+struct iommufd_access;
+struct iommufd_ctx;
+struct iommufd_device;
+struct page;
 
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id);
-- 
cgit v1.2.3


From 6ff4cd1160afafc12ad1603e3d2f39256e4b708d Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Tue, 27 Aug 2024 10:45:15 +0800
Subject: lib/string_choices: Add str_true_false()/str_false_true() helper

Add str_true_false()/str_false_true() helper to return "true" or
"false" string literal.

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Link: https://lore.kernel.org/r/20240827024517.914100-2-lihongbo22@huawei.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string_choices.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index 1320bcdcb89c..ebcc56b28ede 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -48,6 +48,12 @@ static inline const char *str_up_down(bool v)
 }
 #define str_down_up(v)		str_up_down(!(v))
 
+static inline const char *str_true_false(bool v)
+{
+	return v ? "true" : "false";
+}
+#define str_false_true(v)		str_true_false(!(v))
+
 /**
  * str_plural - Return the simple pluralization based on English counts
  * @num: Number used for deciding pluralization
-- 
cgit v1.2.3


From c2708ba91c3c1fba424b77de83b6fc45cbf38c46 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Thu, 5 Sep 2024 17:25:39 +0800
Subject: lib/string_choices: Introduce several opposite string choice helpers

Similar to the exists helper: str_enable_disable/
str_enabled_disabled/str_on_off/str_yes_no helpers, we can
add the opposite helpers. That's str_disable_enable,
str_disabled_enabled, str_off_on and str_no_yes.

There are more than 10 cases currently (expect
str_disable_enable now has 3 use cases) exist in the code
can be replaced with these helper.

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Acked-by: Andy Shevchenko <andy@kernel.org>
Link: https://lore.kernel.org/r/20240905092540.2962122-2-lihongbo22@huawei.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string_choices.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index ebcc56b28ede..fd067992260a 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -8,11 +8,13 @@ static inline const char *str_enable_disable(bool v)
 {
 	return v ? "enable" : "disable";
 }
+#define str_disable_enable(v)		str_enable_disable(!(v))
 
 static inline const char *str_enabled_disabled(bool v)
 {
 	return v ? "enabled" : "disabled";
 }
+#define str_disabled_enabled(v)		str_enabled_disabled(!(v))
 
 static inline const char *str_hi_lo(bool v)
 {
@@ -36,11 +38,13 @@ static inline const char *str_on_off(bool v)
 {
 	return v ? "on" : "off";
 }
+#define str_off_on(v)		str_on_off(!(v))
 
 static inline const char *str_yes_no(bool v)
 {
 	return v ? "yes" : "no";
 }
+#define str_no_yes(v)		str_yes_no(!(v))
 
 static inline const char *str_up_down(bool v)
 {
-- 
cgit v1.2.3


From c121d5cc3a993cdbfab46a152bdd50227a4d5e8c Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Thu, 5 Sep 2024 17:25:40 +0800
Subject: lib/string_choices: Add some comments to make more clear for string
 choices helpers.

Add some comments to explain why we should use string_choices helpers.

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Acked-by: Andy Shevchenko <andy@kernel.org>
Link: https://lore.kernel.org/r/20240905092540.2962122-3-lihongbo22@huawei.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string_choices.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index fd067992260a..120ca0f28e95 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -2,6 +2,19 @@
 #ifndef _LINUX_STRING_CHOICES_H_
 #define _LINUX_STRING_CHOICES_H_
 
+/*
+ * Here provide a series of helpers in the str_$TRUE_$FALSE format (you can
+ * also expand some helpers as needed), where $TRUE and $FALSE are their
+ * corresponding literal strings. These helpers can be used in the printing
+ * and also in other places where constant strings are required. Using these
+ * helpers offers the following benefits:
+ *  1) Reducing the hardcoding of strings, which makes the code more elegant
+ *     through these simple literal-meaning helpers.
+ *  2) Unifying the output, which prevents the same string from being printed
+ *     in various forms, such as enable/disable, enabled/disabled, en/dis.
+ *  3) Deduping by the linker, which results in a smaller binary file.
+ */
+
 #include <linux/types.h>
 
 static inline const char *str_enable_disable(bool v)
-- 
cgit v1.2.3


From 1ae497c78f01855f3695b58481311ffdd429b028 Mon Sep 17 00:00:00 2001
From: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Date: Thu, 5 Sep 2024 13:52:32 +0800
Subject: bpf: use type_may_be_null() helper for nullable-param check

Commit 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for
test runs") does bitwise AND between reg_type and PTR_MAYBE_NULL, which
is correct, but due to type difference the compiler complains:

  net/bpf/bpf_dummy_struct_ops.c:118:31: warning: bitwise operation between different enumeration types ('const enum bpf_reg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
    118 |                 if (info && (info->reg_type & PTR_MAYBE_NULL))
        |                              ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~

Workaround the warning by moving the type_may_be_null() helper from
verifier.c into bpf_verifier.h, and reuse it here to check whether param
is nullable.

Fixes: 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for test runs")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202404241956.HEiRYwWq-lkp@intel.com/
Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240905055233.70203-1-shung-hsi.yu@suse.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8458632824a4..4513372c5bc8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -927,6 +927,11 @@ static inline bool type_is_sk_pointer(enum bpf_reg_type type)
 		type == PTR_TO_XDP_SOCK;
 }
 
+static inline bool type_may_be_null(u32 type)
+{
+	return type & PTR_MAYBE_NULL;
+}
+
 static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
 {
 	env->scratched_regs |= 1U << regno;
-- 
cgit v1.2.3


From 706ae6446494b4523d33b0d96ea1fd9d50ca9be6 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Wed, 4 Sep 2024 14:41:07 +0300
Subject: clk: fixed-rate: add devm_clk_hw_register_fixed_rate_parent_data()

Add devm_clk_hw_register_fixed_rate_parent_data(), devres-managed helper
to register fixed-rate clock with parent_data.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Link: https://lore.kernel.org/r/20240904-devm_clk_hw_register_fixed_rate_parent_data-v1-1-7f14d6b456e5@maquefel.me
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 4a537260f655..7e43caabb54b 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -393,6 +393,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name,
 #define devm_clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate)  \
 	__clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \
 				     NULL, (flags), (fixed_rate), 0, 0, true)
+/**
+ * devm_clk_hw_register_fixed_rate_parent_data - register fixed-rate clock with
+ * the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_data: parent clk data
+ * @flags: framework-specific flags
+ * @fixed_rate: non-adjustable clock rate
+ */
+#define devm_clk_hw_register_fixed_rate_parent_data(dev, name, parent_data, flags, \
+						    fixed_rate)			   \
+	__clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL,		   \
+				     (parent_data), (flags), (fixed_rate), 0,	   \
+				     0, true)
 /**
  * clk_hw_register_fixed_rate_parent_hw - register fixed-rate clock with
  * the clock framework
-- 
cgit v1.2.3


From 9934a1bd45b2b03f6d1204a6ae2780d3b009799f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 5 Aug 2024 10:57:31 +0200
Subject: clk: provide devm_clk_get_optional_enabled_with_rate()

There are clock users in the kernel that can't use
devm_clk_get_optional_enabled() as they need to set rate after getting
the clock and before enabling it. Provide a managed helper that wraps
these operations in the correct order.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://lore.kernel.org/r/20240805-clk-new-helper-v2-1-e5fdd1e1d729@linaro.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 0fa56d672532..851a0f2cf42c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -640,6 +640,32 @@ struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id);
  */
 struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_optional_enabled_with_rate - devm_clk_get_optional() +
+ *                                           clk_set_rate() +
+ *                                           clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ * @rate: new clock rate
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get_enabled().
+ *
+ * The returned clk (if valid) is prepared and enabled and rate was set.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_enabled_with_rate(struct device *dev,
+						    const char *id,
+						    unsigned long rate);
+
 /**
  * devm_get_clk_from_child - lookup and obtain a managed reference to a
  *			     clock producer from child node.
@@ -982,6 +1008,13 @@ static inline struct clk *devm_clk_get_optional_enabled(struct device *dev,
 	return NULL;
 }
 
+static inline struct clk *
+devm_clk_get_optional_enabled_with_rate(struct device *dev, const char *id,
+					unsigned long rate)
+{
+	return NULL;
+}
+
 static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 						 struct clk_bulk_data *clks)
 {
-- 
cgit v1.2.3


From 08062af0a52107a243f7608fd972edb54ca5b7f8 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Wed, 4 Sep 2024 15:34:30 +0000
Subject: net: napi: Prevent overflow of napi_defer_hard_irqs

In commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature")
napi_defer_irqs was added to net_device and napi_defer_irqs_count was
added to napi_struct, both as type int.

This value never goes below zero, so there is not reason for it to be a
signed int. Change the type for both from int to u32, and add an
overflow check to sysfs to limit the value to S32_MAX.

The limit of S32_MAX was chosen because the practical limit before this
patch was S32_MAX (anything larger was an overflow) and thus there are
no behavioral changes introduced. If the extra bit is needed in the
future, the limit can be raised.

Before this patch:

$ sudo bash -c 'echo 2147483649 > /sys/class/net/eth4/napi_defer_hard_irqs'
$ cat /sys/class/net/eth4/napi_defer_hard_irqs
-2147483647

After this patch:

$ sudo bash -c 'echo 2147483649 > /sys/class/net/eth4/napi_defer_hard_irqs'
bash: line 0: echo: write error: Numerical result out of range

Similarly, /sys/class/net/XXXXX/tx_queue_len is defined as unsigned:

include/linux/netdevice.h:      unsigned int            tx_queue_len;

And has an overflow check:

dev_change_tx_queue_len(..., unsigned long new_len):

  if (new_len != (unsigned int)new_len)
          return -ERANGE;

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20240904153431.307932-1-jdamato@fastly.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ca5f0dda733b..b47c00657bd0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -356,7 +356,7 @@ struct napi_struct {
 
 	unsigned long		state;
 	int			weight;
-	int			defer_hard_irqs_count;
+	u32			defer_hard_irqs_count;
 	unsigned long		gro_bitmask;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
@@ -2075,7 +2075,7 @@ struct net_device {
 	unsigned int		real_num_rx_queues;
 	struct netdev_rx_queue	*_rx;
 	unsigned long		gro_flush_timeout;
-	int			napi_defer_hard_irqs;
+	u32			napi_defer_hard_irqs;
 	unsigned int		gro_max_size;
 	unsigned int		gro_ipv4_max_size;
 	rx_handler_func_t __rcu	*rx_handler;
-- 
cgit v1.2.3


From de35996d4b364749194c5b473d1912578880833e Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 4 Aug 2024 18:47:08 -0700
Subject: Input: matrix_keypad - remove support for platform data

There are no more users of struct matrix_keypad_platform_data in the
kernel, remove support for it from the driver.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20240805014710.1961677-6-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/input/matrix_keypad.h | 48 -------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h
index b8d8d69eba29..90867f44ab4d 100644
--- a/include/linux/input/matrix_keypad.h
+++ b/include/linux/input/matrix_keypad.h
@@ -34,52 +34,6 @@ struct matrix_keymap_data {
 	unsigned int	keymap_size;
 };
 
-/**
- * struct matrix_keypad_platform_data - platform-dependent keypad data
- * @keymap_data: pointer to &matrix_keymap_data
- * @row_gpios: pointer to array of gpio numbers representing rows
- * @col_gpios: pointer to array of gpio numbers reporesenting colums
- * @num_row_gpios: actual number of row gpios used by device
- * @num_col_gpios: actual number of col gpios used by device
- * @col_scan_delay_us: delay, measured in microseconds, that is
- *	needed before we can keypad after activating column gpio
- * @debounce_ms: debounce interval in milliseconds
- * @clustered_irq: may be specified if interrupts of all row/column GPIOs
- *	are bundled to one single irq
- * @clustered_irq_flags: flags that are needed for the clustered irq
- * @active_low: gpio polarity
- * @wakeup: controls whether the device should be set up as wakeup
- *	source
- * @no_autorepeat: disable key autorepeat
- * @drive_inactive_cols: drive inactive columns during scan, rather than
- *	making them inputs.
- *
- * This structure represents platform-specific data that use used by
- * matrix_keypad driver to perform proper initialization.
- */
-struct matrix_keypad_platform_data {
-	const struct matrix_keymap_data *keymap_data;
-
-	const unsigned int *row_gpios;
-	const unsigned int *col_gpios;
-
-	unsigned int	num_row_gpios;
-	unsigned int	num_col_gpios;
-
-	unsigned int	col_scan_delay_us;
-
-	/* key debounce interval in milli-second */
-	unsigned int	debounce_ms;
-
-	unsigned int	clustered_irq;
-	unsigned int	clustered_irq_flags;
-
-	bool		active_low;
-	bool		wakeup;
-	bool		no_autorepeat;
-	bool		drive_inactive_cols;
-};
-
 int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
 			       const char *keymap_name,
 			       unsigned int rows, unsigned int cols,
@@ -88,6 +42,4 @@ int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
 int matrix_keypad_parse_properties(struct device *dev,
 				   unsigned int *rows, unsigned int *cols);
 
-#define matrix_keypad_parse_of_params matrix_keypad_parse_properties
-
 #endif /* _MATRIX_KEYPAD_H */
-- 
cgit v1.2.3


From f820b43754cdbd078a0d17122b281f42cbcd2155 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 23 Aug 2024 22:50:27 -0700
Subject: Input: zforce_ts - remove support for platfrom data

There are no in-tree users of platform data and any new ones should
either use device tree or static device properties, so let's remove
platform data support.

Tested-by: Andreas Kemnade <andreas@kemnade.info> # Tolino Shine2HD
Link: https://lore.kernel.org/r/20240824055047.1706392-4-dmitry.torokhov@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/linux/platform_data/zforce_ts.h | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 include/linux/platform_data/zforce_ts.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/zforce_ts.h b/include/linux/platform_data/zforce_ts.h
deleted file mode 100644
index 2463a4a856a6..000000000000
--- a/include/linux/platform_data/zforce_ts.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* drivers/input/touchscreen/zforce.c
- *
- * Copyright (C) 2012-2013 MundoReader S.L.
- */
-
-#ifndef _LINUX_INPUT_ZFORCE_TS_H
-#define _LINUX_INPUT_ZFORCE_TS_H
-
-struct zforce_ts_platdata {
-	unsigned int x_max;
-	unsigned int y_max;
-};
-
-#endif /* _LINUX_INPUT_ZFORCE_TS_H */
-- 
cgit v1.2.3


From 23de126f9248a2f9218175ea0dd1c1403e334440 Mon Sep 17 00:00:00 2001
From: Pieter Van Trappen <pieter.van.trappen@cern.ch>
Date: Wed, 4 Sep 2024 08:27:42 +0200
Subject: net: dsa: microchip: replace unclear KSZ8830 strings

Replace ksz8830 with ksz88x3 for CHIP_ID definition and other
strings. This due to KSZ8830 not being an actual switch but the Chip
ID shared among KSZ8863/8873 switches, impossible to differentiate
from their Chip ID or Revision ID registers.

Now all KSZ*_CHIP_ID macros refer to actual, existing switches which
removes confusion.

Signed-off-by: Pieter Van Trappen <pieter.van.trappen@cern.ch>
Acked-by: Arun Ramadoss <arun.ramadoss@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/microchip-ksz.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index d074019474f5..2ee1a679e592 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -27,7 +27,7 @@ enum ksz_chip_id {
 	KSZ8795_CHIP_ID = 0x8795,
 	KSZ8794_CHIP_ID = 0x8794,
 	KSZ8765_CHIP_ID = 0x8765,
-	KSZ8830_CHIP_ID = 0x8830,
+	KSZ88X3_CHIP_ID = 0x8830,
 	KSZ8864_CHIP_ID = 0x8864,
 	KSZ8895_CHIP_ID = 0x8895,
 	KSZ9477_CHIP_ID = 0x00947700,
-- 
cgit v1.2.3


From 56bd72e9cd8272687aa2a8eccddabc5526cd7845 Mon Sep 17 00:00:00 2001
From: Marek Maslanka <mmaslanka@google.com>
Date: Mon, 12 Aug 2024 18:41:42 +0000
Subject: clocksource: acpi_pm: Add external callback for suspend/resume

Provides the capability to register an external callback for the ACPI PM
timer, which is called during the suspend and resume processes.

Signed-off-by: Marek Maslanka <mmaslanka@google.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20240812184150.1079924-1-mmaslanka@google.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/acpi_pmtmr.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
index 50d88bf1498d..0ded9220d379 100644
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -26,6 +26,19 @@ static inline u32 acpi_pm_read_early(void)
 	return acpi_pm_read_verified() & ACPI_PM_MASK;
 }
 
+/**
+ * Register callback for suspend and resume event
+ *
+ * @cb Callback triggered on suspend and resume
+ * @data Data passed with the callback
+ */
+void acpi_pmtmr_register_suspend_resume_callback(void (*cb)(void *data, bool suspend), void *data);
+
+/**
+ * Remove registered callback for suspend and resume event
+ */
+void acpi_pmtmr_unregister_suspend_resume_callback(void);
+
 #else
 
 static inline u32 acpi_pm_read_early(void)
-- 
cgit v1.2.3


From 3f4c0ad490cc55db5b375a2e19c67159cb255795 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Mon, 26 Aug 2024 12:14:04 +0200
Subject: mtd: nand: Rename the NAND IO iteration helper

Soon a helper for iterating over blocks will be needed (for continuous
read purposes). In order to clarify the intend of this helper, let's
rename it with the "page" wording inside.

While at it, improve the doc and fix a typo.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-2-miquel.raynal@bootlin.com
---
 include/linux/mtd/nand.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index b2996dc987ff..92e06ac33815 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -906,19 +906,19 @@ static inline void nanddev_pos_next_page(struct nand_device *nand,
 }
 
 /**
- * nand_io_iter_init - Initialize a NAND I/O iterator
+ * nand_io_page_iter_init - Initialize a NAND I/O iterator
  * @nand: NAND device
  * @offs: absolute offset
  * @req: MTD request
  * @iter: NAND I/O iterator
  *
  * Initializes a NAND iterator based on the information passed by the MTD
- * layer.
+ * layer for page jumps.
  */
-static inline void nanddev_io_iter_init(struct nand_device *nand,
-					enum nand_page_io_req_type reqtype,
-					loff_t offs, struct mtd_oob_ops *req,
-					struct nand_io_iter *iter)
+static inline void nanddev_io_page_iter_init(struct nand_device *nand,
+					     enum nand_page_io_req_type reqtype,
+					     loff_t offs, struct mtd_oob_ops *req,
+					     struct nand_io_iter *iter)
 {
 	struct mtd_info *mtd = nanddev_to_mtd(nand);
 
@@ -990,10 +990,10 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand,
  * @req: MTD I/O request
  * @iter: NAND I/O iterator
  *
- * Should be used for iterate over pages that are contained in an MTD request.
+ * Should be used for iterating over pages that are contained in an MTD request.
  */
 #define nanddev_io_for_each_page(nand, type, start, req, iter)		\
-	for (nanddev_io_iter_init(nand, type, start, req, iter);	\
+	for (nanddev_io_page_iter_init(nand, type, start, req, iter);	\
 	     !nanddev_io_iter_end(nand, iter);				\
 	     nanddev_io_iter_next_page(nand, iter))
 
-- 
cgit v1.2.3


From 8adf1ac24ba8ee34b8fd36ebf159004ec472fca0 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Mon, 26 Aug 2024 12:14:05 +0200
Subject: mtd: nand: Introduce a block iterator

In order to be able to iterate easily across eraseblocks rather than
pages, let's introduce a block iterator inspired from the page iterator.

The main usage of this iterator will be for continuous/sequential reads,
where it is interesting to use a single request rather than split the
requests in smaller chunks (ie. pages) that can be hardly optimized.

So a "continuous" boolean get's added for this purpose.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-3-miquel.raynal@bootlin.com
---
 include/linux/mtd/nand.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 92e06ac33815..1e4208040956 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -103,6 +103,8 @@ enum nand_page_io_req_type {
  * @ooblen: the number of OOB bytes to read from/write to this page
  * @oobbuf: buffer to store OOB data in or get OOB data from
  * @mode: one of the %MTD_OPS_XXX mode
+ * @continuous: no need to start over the operation at the end of each page, the
+ * NAND device will automatically prepare the next one
  *
  * This object is used to pass per-page I/O requests to NAND sub-layers. This
  * way all useful information are already formatted in a useful way and
@@ -125,6 +127,7 @@ struct nand_page_io_req {
 		void *in;
 	} oobbuf;
 	int mode;
+	bool continuous;
 };
 
 const struct mtd_ooblayout_ops *nand_get_small_page_ooblayout(void);
@@ -937,6 +940,43 @@ static inline void nanddev_io_page_iter_init(struct nand_device *nand,
 	iter->req.ooblen = min_t(unsigned int,
 				 iter->oobbytes_per_page - iter->req.ooboffs,
 				 iter->oobleft);
+	iter->req.continuous = false;
+}
+
+/**
+ * nand_io_block_iter_init - Initialize a NAND I/O iterator
+ * @nand: NAND device
+ * @offs: absolute offset
+ * @req: MTD request
+ * @iter: NAND I/O iterator
+ *
+ * Initializes a NAND iterator based on the information passed by the MTD
+ * layer for block jumps (no OOB)
+ *
+ * In practice only reads may leverage this iterator.
+ */
+static inline void nanddev_io_block_iter_init(struct nand_device *nand,
+					      enum nand_page_io_req_type reqtype,
+					      loff_t offs, struct mtd_oob_ops *req,
+					      struct nand_io_iter *iter)
+{
+	unsigned int offs_in_eb;
+
+	iter->req.type = reqtype;
+	iter->req.mode = req->mode;
+	iter->req.dataoffs = nanddev_offs_to_pos(nand, offs, &iter->req.pos);
+	iter->req.ooboffs = 0;
+	iter->oobbytes_per_page = 0;
+	iter->dataleft = req->len;
+	iter->oobleft = 0;
+	iter->req.databuf.in = req->datbuf;
+	offs_in_eb = (nand->memorg.pagesize * iter->req.pos.page) + iter->req.dataoffs;
+	iter->req.datalen = min_t(unsigned int,
+				  nanddev_eraseblock_size(nand) - offs_in_eb,
+				  iter->dataleft);
+	iter->req.oobbuf.in = NULL;
+	iter->req.ooblen = 0;
+	iter->req.continuous = true;
 }
 
 /**
@@ -962,6 +1002,25 @@ static inline void nanddev_io_iter_next_page(struct nand_device *nand,
 				 iter->oobleft);
 }
 
+/**
+ * nand_io_iter_next_block - Move to the next block
+ * @nand: NAND device
+ * @iter: NAND I/O iterator
+ *
+ * Updates the @iter to point to the next block.
+ * No OOB handling available.
+ */
+static inline void nanddev_io_iter_next_block(struct nand_device *nand,
+					      struct nand_io_iter *iter)
+{
+	nanddev_pos_next_eraseblock(nand, &iter->req.pos);
+	iter->dataleft -= iter->req.datalen;
+	iter->req.databuf.in += iter->req.datalen;
+	iter->req.dataoffs = 0;
+	iter->req.datalen = min_t(unsigned int, nanddev_eraseblock_size(nand),
+				  iter->dataleft);
+}
+
 /**
  * nand_io_iter_end - Should end iteration or not
  * @nand: NAND device
@@ -997,6 +1056,21 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand,
 	     !nanddev_io_iter_end(nand, iter);				\
 	     nanddev_io_iter_next_page(nand, iter))
 
+/**
+ * nand_io_for_each_block - Iterate over all NAND pages contained in an MTD I/O
+ *			    request, one block at a time
+ * @nand: NAND device
+ * @start: start address to read/write from
+ * @req: MTD I/O request
+ * @iter: NAND I/O iterator
+ *
+ * Should be used for iterating over blocks that are contained in an MTD request.
+ */
+#define nanddev_io_for_each_block(nand, type, start, req, iter)		\
+	for (nanddev_io_block_iter_init(nand, type, start, req, iter);	\
+	     !nanddev_io_iter_end(nand, iter);				\
+	     nanddev_io_iter_next_block(nand, iter))
+
 bool nanddev_isbad(struct nand_device *nand, const struct nand_pos *pos);
 bool nanddev_isreserved(struct nand_device *nand, const struct nand_pos *pos);
 int nanddev_markbad(struct nand_device *nand, const struct nand_pos *pos);
-- 
cgit v1.2.3


From 631cfdd0520d19b7f4fc13b834fd9c8b46c6dbac Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Mon, 26 Aug 2024 12:14:07 +0200
Subject: mtd: spi-nand: Add continuous read support

A regular page read consist in:
- Asking one page of content from the NAND array to be loaded in the
  chip's SRAM,
- Waiting for the operation to be done,
- Retrieving the data (I/O phase) from the chip's SRAM.

When reading several sequential pages, the above operation is repeated
over and over. There is however a way to optimize these accesses, by
enabling continuous reads. The feature requires the NAND chip to have a
second internal SRAM area plus a bit of additional internal logic to
trigger another internal transfer between the NAND array and the second
SRAM area while the I/O phase is ongoing. Once the first I/O phase is
done, the host can continue reading more data, continuously, as the chip
will automatically switch to the second SRAM content (which has already
been loaded) and in turns trigger the next load into the first SRAM area
again.

From an instruction perspective, the command op-codes are different, but
the same cycles are required. The only difference is that after a
continuous read (which is stopped by a CS deassert), the host must
observe a delay of tRST. However, because there is no guarantee in Linux
regarding the actual state of the CS pin after a transfer (in order to
speed-up the next transfer if targeting the same device), it was
necessary to manually end the continuous read with a configuration
register write operation.

Continuous reads have two main drawbacks:
* They only work on full pages (column address ignored)
* Only the main data area is pulled, out-of-band bytes are not
  accessible. Said otherwise, the feature can only be useful with on-die
  ECC engines.

Performance wise, measures have been performed on a Zynq platform using
Macronix SPI-NAND controller with a Macronix chip (based on the
flash_speed tool modified for testing sequential reads):
- 1-1-1 mode: performances improved from +3% (2-pages) up to +10% after
              a dozen pages.
- 1-1-4 mode: performances improved from +15% (2-pages) up to +40% after
              a dozen pages.

This series is based on a previous work from Macronix engineer Jaime
Liao.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-5-miquel.raynal@bootlin.com
---
 include/linux/mtd/spinand.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 5c19ead60499..14dce347dc46 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -336,6 +336,7 @@ struct spinand_ondie_ecc_conf {
  * @op_variants.update_cache: variants of the update-cache operation
  * @select_target: function used to select a target/die. Required only for
  *		   multi-die chips
+ * @set_cont_read: enable/disable continuous cached reads
  *
  * Each SPI NAND manufacturer driver should have a spinand_info table
  * describing all the chips supported by the driver.
@@ -354,6 +355,8 @@ struct spinand_info {
 	} op_variants;
 	int (*select_target)(struct spinand_device *spinand,
 			     unsigned int target);
+	int (*set_cont_read)(struct spinand_device *spinand,
+			     bool enable);
 };
 
 #define SPINAND_ID(__method, ...)					\
@@ -379,6 +382,9 @@ struct spinand_info {
 #define SPINAND_SELECT_TARGET(__func)					\
 	.select_target = __func,
 
+#define SPINAND_CONT_READ(__set_cont_read)				\
+	.set_cont_read = __set_cont_read,
+
 #define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants,	\
 		     __flags, ...)					\
 	{								\
@@ -422,6 +428,12 @@ struct spinand_dirmap {
  *		passed in spi_mem_op be DMA-able, so we can't based the bufs on
  *		the stack
  * @manufacturer: SPI NAND manufacturer information
+ * @cont_read_possible: Field filled by the core once the whole system
+ *		configuration is known to tell whether continuous reads are
+ *		suitable to use or not in general with this chip/configuration.
+ *		A per-transfer check must of course be done to ensure it is
+ *		actually relevant to enable this feature.
+ * @set_cont_read: Enable/disable the continuous read feature
  * @priv: manufacturer private data
  */
 struct spinand_device {
@@ -451,6 +463,10 @@ struct spinand_device {
 	u8 *scratchbuf;
 	const struct spinand_manufacturer *manufacturer;
 	void *priv;
+
+	bool cont_read_possible;
+	int (*set_cont_read)(struct spinand_device *spinand,
+			     bool enable);
 };
 
 /**
-- 
cgit v1.2.3


From a06f2e7cc4de104b14971e08d37f9d08c256b053 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Mon, 26 Aug 2024 12:14:08 +0200
Subject: mtd: spi-nand: Expose spinand_write_reg_op()

This helper function will soon be used from a vendor driver, let's
export it through the spinand.h header. No need for any export, as there
is currently no reason for any module to need it.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-6-miquel.raynal@bootlin.com
---
 include/linux/mtd/spinand.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 14dce347dc46..8dbf67ca710a 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -533,6 +533,7 @@ int spinand_match_and_init(struct spinand_device *spinand,
 			   enum spinand_readid_method rdid_method);
 
 int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
+int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val);
 int spinand_select_target(struct spinand_device *spinand, unsigned int target);
 
 #endif /* __LINUX_MTD_SPINAND_H */
-- 
cgit v1.2.3


From bb2ac59f8205165fce5aee9135bd1e2ea9b1a74b Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 01:18:23 +0100
Subject: mfd: axp20x: AXP717: Add support for boost regulator

The AXP717 also contains a boost regulator, to provide the 5V USB VBUS
rail when running on battery.

Add the registers to the MFD description to be able to use them from the
regulator driver.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: John Watts <contact@jookia.org>
Acked-by: Lee Jones <lee@kernel.org>
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Link: https://patch.msgid.link/20240816001824.6028-3-andre.przywara@arm.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/axp20x.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 8c0a33a2e9ce..3758f986491c 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -115,6 +115,8 @@ enum axp20x_variants {
 #define AXP313A_IRQ_STATE		0x21
 
 #define AXP717_ON_INDICATE		0x00
+#define AXP717_MODULE_EN_CONTROL_2	0x19
+#define AXP717_BOOST_CONTROL		0x1e
 #define AXP717_IRQ0_EN			0x40
 #define AXP717_IRQ1_EN			0x41
 #define AXP717_IRQ2_EN			0x42
-- 
cgit v1.2.3


From 22dfe2ea1d63f15d8b9661e7690ac0a03159db6a Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Fri, 16 Aug 2024 01:18:24 +0100
Subject: regulator: axp20x: AXP717: Add boost regulator

The AXP717 also contains an adjustable boost regulator, to provide the
5V USB VBUS rail when running on battery.

Add the regulator description that states the voltage range this
regulator can cover.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: John Watts <contact@jookia.org>
Reviewed-by: Chen-Yu Tsai <wens@csie.org>
Link: https://patch.msgid.link/20240816001824.6028-4-andre.przywara@arm.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/axp20x.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 3758f986491c..e0cd66bd3b6d 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -486,6 +486,7 @@ enum {
 	AXP717_CLDO3,
 	AXP717_CLDO4,
 	AXP717_CPUSLDO,
+	AXP717_BOOST,
 	AXP717_REG_ID_MAX,
 };
 
-- 
cgit v1.2.3


From 78f76b09c915d7281317d8e082c4c02f325a0366 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 17 Jul 2024 17:48:16 +0900
Subject: ata: libata: Move sata_std_hardreset() definition to libata-sata.c

Unlike ata_std_prereset() and ata_std_postreset(), the function
sata_std_hardreset() applies only to SATA devices, as its name implies.
So move its definition to libata-sata.c.

Together with this, also move the definition of sata_port_ops to
libata-sata.c, where it belongs.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 include/linux/libata.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6552e90753ae..d52ae7723c05 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1104,8 +1104,6 @@ static inline bool ata_port_is_frozen(const struct ata_port *ap)
 extern int ata_std_prereset(struct ata_link *link, unsigned long deadline);
 extern int ata_wait_after_reset(struct ata_link *link, unsigned long deadline,
 				int (*check_ready)(struct ata_link *link));
-extern int sata_std_hardreset(struct ata_link *link, unsigned int *class,
-			      unsigned long deadline);
 extern void ata_std_postreset(struct ata_link *link, unsigned int *classes);
 
 extern struct ata_host *ata_host_alloc(struct device *dev, int n_ports);
@@ -1229,6 +1227,8 @@ extern int sata_scr_read(struct ata_link *link, int reg, u32 *val);
 extern int sata_scr_write(struct ata_link *link, int reg, u32 val);
 extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val);
 extern int sata_set_spd(struct ata_link *link);
+int sata_std_hardreset(struct ata_link *link, unsigned int *class,
+		       unsigned long deadline);
 extern int sata_link_hardreset(struct ata_link *link,
 			const unsigned int *timing, unsigned long deadline,
 			bool *online, int (*check_ready)(struct ata_link *));
@@ -1256,6 +1256,11 @@ static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val)
 	return -EOPNOTSUPP;
 }
 static inline int sata_set_spd(struct ata_link *link) { return -EOPNOTSUPP; }
+static inline int sata_std_hardreset(struct ata_link *link, unsigned int *class,
+				     unsigned long deadline)
+{
+	return -EOPNOTSUPP;
+}
 static inline int sata_link_hardreset(struct ata_link *link,
 				      const unsigned int *timing,
 				      unsigned long deadline,
-- 
cgit v1.2.3


From 10e807637f288f9963340a30ae8da9bb06beca3c Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 10 Jun 2024 19:11:31 +0900
Subject: ata: libata: Rename ata_eh_read_sense_success_ncq_log()

The function ata_eh_read_sense_success_ncq_log() does more that just
reading the sense data for successful NCQ commands log page as it also
sets the sense data for all commands listed in the log page.

Rename this function to ata_eh_get_ncq_success_sense() to better
describe what the function does. Furthermore, since this function is
only called from ata_eh_get_success_sense() in libata-eh.c, there is no
need to export it and its declaration can be moved to
drivers/ata/libata.h.

To be consistent with this change, the function
ata_eh_read_sense_success_non_ncq() is also renamed to
ata_eh_get_non_ncq_success_sense().

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
---
 include/linux/libata.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index d52ae7723c05..55a6b57742bc 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1234,7 +1234,6 @@ extern int sata_link_hardreset(struct ata_link *link,
 			bool *online, int (*check_ready)(struct ata_link *));
 extern int sata_link_resume(struct ata_link *link, const unsigned int *params,
 			    unsigned long deadline);
-extern int ata_eh_read_sense_success_ncq_log(struct ata_link *link);
 extern void ata_eh_analyze_ncq_error(struct ata_link *link);
 #else
 static inline const unsigned int *
@@ -1277,10 +1276,6 @@ static inline int sata_link_resume(struct ata_link *link,
 {
 	return -EOPNOTSUPP;
 }
-static inline int ata_eh_read_sense_success_ncq_log(struct ata_link *link)
-{
-	return -EOPNOTSUPP;
-}
 static inline void ata_eh_analyze_ncq_error(struct ata_link *link) { }
 #endif
 extern int sata_link_debounce(struct ata_link *link,
-- 
cgit v1.2.3


From da65bbdd3bc1e8d2193e01167a413d90d9988c04 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 28 Aug 2024 11:07:43 +0900
Subject: ata: libata: Move sector_buf from struct ata_port to struct
 ata_device

The 512B buffer sector_buf field of struct ata_port is used for scanning
devices as well as during error recovery with ata EH. This buffer is
thus useless if a port does not have a device connected to it.
And also given that commands using this buffer are issued to devices,
and not to ports, move this buffer definition from struct ata_port to
struct ata_device.

This change slightly increases system memory usage for systems using a
port-multiplier as in that case we do not need a per-device buffer for
scanning devices (PMP does not allow parallel scanning) nor for EH (as
when entering EH we are guaranteed that all commands to all devices
connected to the PMP have completed or have been aborted). However,
this change reduces memory usage on systems that have many ports with
only few devices rives connected, which is a much more common use case
than the PMP use case.

Suggested-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 55a6b57742bc..aac38dcd2230 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -769,6 +769,9 @@ struct ata_device {
 	int			spdn_cnt;
 	/* ering is CLEAR_END, read comment above CLEAR_END */
 	struct ata_ering	ering;
+
+	/* For EH */
+	u8			sector_buf[ATA_SECT_SIZE] ____cacheline_aligned;
 };
 
 /* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are
@@ -916,7 +919,6 @@ struct ata_port {
 #endif
 	/* owned by EH */
 	u8			*ncq_sense_buf;
-	u8			sector_buf[ATA_SECT_SIZE] ____cacheline_aligned;
 };
 
 /* The following initializer overrides a method to NULL whether one of
-- 
cgit v1.2.3


From 602bcf212637633d537ee74bf39c6bc5722efb9b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 17 Jul 2024 17:55:31 +0900
Subject: ata: libata: Improve CDL resource management

The ncq_sense_buf buffer field of struct ata_port is allocated and used
only for devices that support the Command Duration Limits (CDL) feature.
However, the cdl buffer of struct ata_device, which is used to cache the
command duration limits log page for devices supporting CDL is always
allocated as part of struct ata_device, which is wasteful of memory for
devices that do not support this feature.

Clean this up by defining both buffers as part of the new ata_cdl
structure and allocating this structure only for devices that support
the CDL feature. This new structure is attached to struct ata_device
using the cdl pointer.

The functions ata_dev_init_cdl_resources() and
ata_dev_cleanup_cdl_resources() are defined to manage this new structure
allocation, initialization and freeing when a port is removed or a
device disabled. ata_dev_init_cdl_resources() is called from
ata_dev_config_cdl() only for devices that support CDL.
ata_dev_cleanup_cdl_resources() is called from ata_dev_free_resources()
to free the ata_cdl structure when a device is being disabled by EH.

Note that the name of the former cdl log buffer of struct ata_device is
changed to desc_log_buf to make it clearer that it is a buffer for the
limit descriptors log page.

This change reduces the size of struct ata_device, thus reducing memory
usage for ATA devices that do not support the CDL feature.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index aac38dcd2230..9b4a6ff03235 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -700,6 +700,21 @@ struct ata_cpr_log {
 	struct ata_cpr		cpr[] __counted_by(nr_cpr);
 };
 
+struct ata_cdl {
+	/*
+	 * Buffer to cache the CDL log page 18h (command duration descriptors)
+	 * for SCSI-ATA translation.
+	 */
+	u8			desc_log_buf[ATA_LOG_CDL_SIZE];
+
+	/*
+	 * Buffer to handle reading the sense data for successful NCQ Commands
+	 * log page for commands using a CDL with one of the limits policy set
+	 * to 0xD (successful completion with sense data available bit set).
+	 */
+	u8			ncq_sense_log_buf[ATA_LOG_SENSE_NCQ_SIZE];
+};
+
 struct ata_device {
 	struct ata_link		*link;
 	unsigned int		devno;		/* 0 or 1 */
@@ -762,8 +777,8 @@ struct ata_device {
 	/* Concurrent positioning ranges */
 	struct ata_cpr_log	*cpr_log;
 
-	/* Command Duration Limits log support */
-	u8			cdl[ATA_LOG_CDL_SIZE];
+	/* Command Duration Limits support */
+	struct ata_cdl		*cdl;
 
 	/* error history */
 	int			spdn_cnt;
@@ -917,8 +932,6 @@ struct ata_port {
 #ifdef CONFIG_ATA_ACPI
 	struct ata_acpi_gtm	__acpi_init_gtm; /* use ata_acpi_init_gtm() */
 #endif
-	/* owned by EH */
-	u8			*ncq_sense_buf;
 };
 
 /* The following initializer overrides a method to NULL whether one of
-- 
cgit v1.2.3


From 446216bd8e5d4a3ffc9738f6adffc98fbfdcc582 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Sun, 8 Sep 2024 13:05:48 +0900
Subject: firewire: core: expose kernel API to schedule work item to process
 isochronous context

In packet-per-buffer mode for isochronous context of 1394 OHCI, software
can schedule hardIRQ to the buffer in which the content of isochronous
packet is processed. The actual behaviour is different between isochronous
receive (IR) and transmit (IT) contexts in respect to isochronous cycle in
which the hardIRQ occurs.

In IR context, the hardIRQ occurs when the buffer is filled actually by
the content of received packet. If there are any isochronous cycles in
which the packet transmission is skipped, it is postponed to generate
the hardIRQ in respect to the isochronous cycle. In IT context, software
can schedule the content of packet every isochronous cycle including
skipping, therefore the hardIRQ occurs in the isochronous cycle to which
the software scheduled.

ALSA firewire stack uses the packet-per-buffer mode for both IR/IT
contexts. To process time stamp per packet (or per sample in some cases)
steadily for media clock recovery against unexpected transmission skips,
it uses an IT context to operate all of isochronous contexts by calls of
fw_iso_context_flush_completions() in the bottom-half of hardIRQ for the
IT context.

Although it looks well to handle all of isochronous contexts in a single
bottom-half context, it relatively takes longer time. In the future code
integration (not yet), it is possible to apply parallelism method to
process these context. In the case, it is useful to allow unit drivers to
schedule work items to process these isochronous contexts.

As a preparation, this commit exposes
fw_iso_context_schedule_flush_completions() as a kernel API available by
unit drivers. It is renamed from fw_iso_context_queue_work() since it is
a counter part of fw_iso_context_flush_completions().

Link: https://lore.kernel.org/r/20240908040549.75304-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 72f497b61739..f815d12deda0 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -531,6 +531,23 @@ int fw_iso_context_queue(struct fw_iso_context *ctx,
 			 unsigned long payload);
 void fw_iso_context_queue_flush(struct fw_iso_context *ctx);
 int fw_iso_context_flush_completions(struct fw_iso_context *ctx);
+
+/**
+ * fw_iso_context_schedule_flush_completions() - schedule work item to process isochronous context.
+ * @ctx: the isochronous context
+ *
+ * Schedule a work item on workqueue to process the isochronous context. The registered callback
+ * function is called in the worker if some packets have been already transferred since the last
+ * time. If it is required to process the context in the current context,
+ * fw_iso_context_flush_completions() is available instead.
+ *
+ * Context: Any context.
+ */
+static inline void fw_iso_context_schedule_flush_completions(struct fw_iso_context *ctx)
+{
+	queue_work(ctx->card->isoc_wq, &ctx->work);
+}
+
 int fw_iso_context_start(struct fw_iso_context *ctx,
 			 int cycle, int sync, int tags);
 int fw_iso_context_stop(struct fw_iso_context *ctx);
-- 
cgit v1.2.3


From 1d07085402d122f223bda3f8b72bea37a46ee0c9 Mon Sep 17 00:00:00 2001
From: Bibo Mao <maobibo@loongson.cn>
Date: Sat, 7 Sep 2024 16:27:20 +0800
Subject: smp: Mark smp_prepare_boot_cpu() __init

smp_prepare_boot_cpu() is only called during boot, hence mark it as
__init.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>
Link: https://lore.kernel.org/all/20240907082720.452148-1-maobibo@loongson.cn
---
 include/linux/smp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index fcd61dfe2af3..6a0813c905d0 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -109,7 +109,7 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func,
  * Architecture specific boot CPU setup.  Defined as empty weak function in
  * init/main.c. Architectures can override it.
  */
-void smp_prepare_boot_cpu(void);
+void __init smp_prepare_boot_cpu(void);
 
 #ifdef CONFIG_SMP
 
-- 
cgit v1.2.3


From c259acab839e57eab0318f32da4ae803a8d59397 Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Wed, 4 Sep 2024 07:13:05 -0700
Subject: ptp/ioctl: support MONOTONIC{,_RAW} timestamps for
 PTP_SYS_OFFSET_EXTENDED

The ability to read the PHC (Physical Hardware Clock) alongside
multiple system clocks is currently dependent on the specific
hardware architecture. This limitation restricts the use of
PTP_SYS_OFFSET_PRECISE to certain hardware configurations.

The generic soultion which would work across all architectures
is to read the PHC along with the latency to perform PHC-read as
offered by PTP_SYS_OFFSET_EXTENDED which provides pre and post
timestamps.  However, these timestamps are currently limited
to the CLOCK_REALTIME timebase. Since CLOCK_REALTIME is affected
by NTP (or similar time synchronization services), it can
experience significant jumps forward or backward. This hinders
the precise latency measurements that PTP_SYS_OFFSET_EXTENDED
is designed to provide.

This problem could be addressed by supporting MONOTONIC_RAW
timestamps within PTP_SYS_OFFSET_EXTENDED. Unlike CLOCK_REALTIME
or CLOCK_MONOTONIC, the MONOTONIC_RAW timebase is unaffected
by NTP adjustments.

This enhancement can be implemented by utilizing one of the three
reserved words within the PTP_SYS_OFFSET_EXTENDED struct to pass
the clock-id for timestamps.  The current behavior aligns with
clock-id for CLOCK_REALTIME timebase (value of 0), ensuring
backward compatibility of the UAPI.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_clock_kernel.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 6e4b8206c7d0..c892d22ce0a7 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -47,10 +47,12 @@ struct system_device_crosststamp;
  * struct ptp_system_timestamp - system time corresponding to a PHC timestamp
  * @pre_ts: system timestamp before capturing PHC
  * @post_ts: system timestamp after capturing PHC
+ * @clockid: clock-base used for capturing the system timestamps
  */
 struct ptp_system_timestamp {
 	struct timespec64 pre_ts;
 	struct timespec64 post_ts;
+	clockid_t clockid;
 };
 
 /**
@@ -457,14 +459,40 @@ static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp,
 
 static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
 {
-	if (sts)
-		ktime_get_real_ts64(&sts->pre_ts);
+	if (sts) {
+		switch (sts->clockid) {
+		case CLOCK_REALTIME:
+			ktime_get_real_ts64(&sts->pre_ts);
+			break;
+		case CLOCK_MONOTONIC:
+			ktime_get_ts64(&sts->pre_ts);
+			break;
+		case CLOCK_MONOTONIC_RAW:
+			ktime_get_raw_ts64(&sts->pre_ts);
+			break;
+		default:
+			break;
+		}
+	}
 }
 
 static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
 {
-	if (sts)
-		ktime_get_real_ts64(&sts->post_ts);
+	if (sts) {
+		switch (sts->clockid) {
+		case CLOCK_REALTIME:
+			ktime_get_real_ts64(&sts->post_ts);
+			break;
+		case CLOCK_MONOTONIC:
+			ktime_get_ts64(&sts->post_ts);
+			break;
+		case CLOCK_MONOTONIC_RAW:
+			ktime_get_raw_ts64(&sts->post_ts);
+			break;
+		default:
+			break;
+		}
+	}
 }
 
 #endif
-- 
cgit v1.2.3


From 1fcb932c8b5ce86219d7dedcd63659351a43291c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Thu, 4 Jul 2024 00:56:40 +0200
Subject: rcu/nocb: Simplify (de-)offloading state machine

Now that the (de-)offloading process can only apply to offline CPUs,
there is no more concurrency between rcu_core and nocb kthreads. Also
the mutation now happens on empty queues.

Therefore the state machine can be reduced to a single bit called
SEGCBLIST_OFFLOADED. Simplify the transition as follows:

* Upon offloading: queue the rdp to be added to the rcuog list and
  wait for the rcuog kthread to set the SEGCBLIST_OFFLOADED bit. Unpark
  rcuo kthread.

* Upon de-offloading: Park rcuo kthread. Queue the rdp to be removed
  from the rcuog list and wait for the rcuog kthread to clear the
  SEGCBLIST_OFFLOADED bit.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 include/linux/rcu_segcblist.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 1ef1bb54853d..2fdc2208f1ca 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -185,9 +185,7 @@ struct rcu_cblist {
  *  ----------------------------------------------------------------------------
  */
 #define SEGCBLIST_ENABLED	BIT(0)
-#define SEGCBLIST_LOCKING	BIT(1)
-#define SEGCBLIST_KTHREAD_GP	BIT(2)
-#define SEGCBLIST_OFFLOADED	BIT(3)
+#define SEGCBLIST_OFFLOADED	BIT(1)
 
 struct rcu_segcblist {
 	struct rcu_head *head;
-- 
cgit v1.2.3


From bd7c8ff9fef4b21a97f9b30a7364845ee6eaaf23 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Wed, 4 Sep 2024 15:04:53 +0200
Subject: treewide: Fix wrong singular form of jiffies in comments

There are several comments all over the place, which uses a wrong singular
form of jiffies.

Replace 'jiffie' by 'jiffy'. No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Link: https://lore.kernel.org/all/20240904-devel-anna-maria-b4-timers-flseep-v1-3-e98760256370@linutronix.de
---
 include/linux/jiffies.h             | 2 +-
 include/linux/timekeeper_internal.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index d9f1435a5a13..1220f0fbe5bf 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -418,7 +418,7 @@ extern unsigned long preset_lpj;
 #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
                                         TICK_NSEC -1) / (u64)TICK_NSEC))
 /*
- * The maximum jiffie value is (MAX_INT >> 1).  Here we translate that
+ * The maximum jiffy value is (MAX_INT >> 1).  Here we translate that
  * into seconds.  The 64-bit case will overflow if we are not careful,
  * so use the messy SH_DIV macro to do it.  Still all constants.
  */
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 84ff2844df2a..902c20ef495a 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -73,7 +73,7 @@ struct tk_read_base {
  * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
- * we need to add to xtime (or xtime corrected for sub jiffie times)
+ * we need to add to xtime (or xtime corrected for sub jiffy times)
  * to get to monotonic time.  Monotonic is pegged at zero at system
  * boot time, so wall_to_monotonic will be negative, however, we will
  * ALWAYS keep the tv_nsec part positive so we can use the usual
-- 
cgit v1.2.3


From ca229bdbef29be207c8666f106acc3bf50736c05 Mon Sep 17 00:00:00 2001
From: Cheng Ming Lin <chengminglin@mxic.com.tw>
Date: Mon, 9 Sep 2024 17:26:42 +0800
Subject: mtd: spinand: Add support for setting plane select bits

Add two flags for inserting the Plane Select bit into the column
address during the write_to_cache and the read_from_cache operation.

Add the SPINAND_HAS_PROG_PLANE_SELECT_BIT flag for serial NAND flash
that require inserting the Plane Select bit into the column address
during the write_to_cache operation.

Add the SPINAND_HAS_READ_PLANE_SELECT_BIT flag for serial NAND flash
that require inserting the Plane Select bit into the column address
during the read_from_cache operation.

Signed-off-by: Cheng Ming Lin <chengminglin@mxic.com.tw>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20240909092643.2434479-2-linchengming884@gmail.com
---
 include/linux/mtd/spinand.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 8dbf67ca710a..702e5fb13dae 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -312,6 +312,8 @@ struct spinand_ecc_info {
 
 #define SPINAND_HAS_QE_BIT		BIT(0)
 #define SPINAND_HAS_CR_FEAT_BIT		BIT(1)
+#define SPINAND_HAS_PROG_PLANE_SELECT_BIT		BIT(2)
+#define SPINAND_HAS_READ_PLANE_SELECT_BIT		BIT(3)
 
 /**
  * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure
-- 
cgit v1.2.3


From d688d65a847ff910146eff51e70f4b7049895eb5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 30 Aug 2024 15:04:49 +0200
Subject: fs: add generic_llseek_cookie()

This is similar to generic_file_llseek() but allows the caller to
specify a cookie that will be updated to indicate that a seek happened.
Caller's requiring that information in their readdir implementations can
use that.

Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-8-6d3e4816aa7b@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58c91a52cad1..3e6b3c1afb31 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3202,6 +3202,8 @@ extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
 		int whence, loff_t maxsize, loff_t eof);
+loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+			     u64 *cookie);
 extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 		int whence, loff_t size);
 extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
-- 
cgit v1.2.3


From 88d2ae0e6eb84e1ed58f339c6a0de16c24fa2a60 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 8 Aug 2024 11:18:21 -0400
Subject: inode: make __iget() a static inline

bcachefs is switching to an rhashtable for vfs inodes instead of the
standard inode.c hashtable, so we need this exported, or - a static
inline makes more sense for a single atomic_inc().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/fs.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..8fc4bad3b6ae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3094,7 +3094,14 @@ static inline bool is_zero_ino(ino_t ino)
 	return (u32)ino == 0;
 }
 
-extern void __iget(struct inode * inode);
+/*
+ * inode->i_lock must be held
+ */
+static inline void __iget(struct inode *inode)
+{
+	atomic_inc(&inode->i_count);
+}
+
 extern void iget_failed(struct inode *);
 extern void clear_inode(struct inode *);
 extern void __destroy_inode(struct inode *);
-- 
cgit v1.2.3


From f6594633817374a64a5fb31007bd810cad1425ff Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 17 Jun 2024 19:00:33 -0400
Subject: lib/generic-radix-tree.c: genradix_ptr_inlined()

Provide an inlined fast path

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/generic-radix-tree.h | 75 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index f3512fddf3d7..8a3e1e886d1c 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -48,6 +48,49 @@ struct genradix_root;
 #define GENRADIX_NODE_SHIFT	9
 #define GENRADIX_NODE_SIZE	(1U << GENRADIX_NODE_SHIFT)
 
+#define GENRADIX_ARY		(GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
+#define GENRADIX_ARY_SHIFT	ilog2(GENRADIX_ARY)
+
+/* depth that's needed for a genradix that can address up to ULONG_MAX: */
+#define GENRADIX_MAX_DEPTH	\
+	DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
+
+#define GENRADIX_DEPTH_MASK				\
+	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
+
+static inline int genradix_depth_shift(unsigned depth)
+{
+	return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
+}
+
+/*
+ * Returns size (of data, in bytes) that a tree of a given depth holds:
+ */
+static inline size_t genradix_depth_size(unsigned depth)
+{
+	return 1UL << genradix_depth_shift(depth);
+}
+
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
+{
+	return (unsigned long) r & GENRADIX_DEPTH_MASK;
+}
+
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+{
+	return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
+}
+
+struct genradix_node {
+	union {
+		/* Interior node: */
+		struct genradix_node	*children[GENRADIX_ARY];
+
+		/* Leaf: */
+		u8			data[GENRADIX_NODE_SIZE];
+	};
+};
+
 struct __genradix {
 	struct genradix_root		*root;
 };
@@ -128,6 +171,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
 #define __genradix_idx_to_offset(_radix, _idx)			\
 	__idx_to_offset(_idx, __genradix_obj_size(_radix))
 
+static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
+{
+	struct genradix_root *r = READ_ONCE(radix->root);
+	struct genradix_node *n = genradix_root_to_node(r);
+	unsigned level		= genradix_root_to_depth(r);
+	unsigned shift		= genradix_depth_shift(level);
+
+	if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
+		return NULL;
+
+	while (n && shift > GENRADIX_NODE_SHIFT) {
+		shift -= GENRADIX_ARY_SHIFT;
+		n = n->children[offset >> shift];
+		offset &= (1UL << shift) - 1;
+	}
+
+	return n ? &n->data[offset] : NULL;
+}
+
+#define genradix_ptr_inlined(_radix, _idx)			\
+	(__genradix_cast(_radix)				\
+	 __genradix_ptr_inlined(&(_radix)->tree,		\
+			__genradix_idx_to_offset(_radix, _idx)))
+
 void *__genradix_ptr(struct __genradix *, size_t);
 
 /**
@@ -144,6 +211,14 @@ void *__genradix_ptr(struct __genradix *, size_t);
 
 void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
 
+#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp)			\
+	(__genradix_cast(_radix)					\
+	 (__genradix_ptr_inlined(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _idx)) ?:	\
+	  __genradix_ptr_alloc(&(_radix)->tree,				\
+			__genradix_idx_to_offset(_radix, _idx),		\
+			_gfp)))
+
 /**
  * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
  *			if necessary
-- 
cgit v1.2.3


From b3f9da79e7783ca4f1c1d4214af976c548629c1d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 10 Aug 2024 23:14:44 -0400
Subject: lib/generic-radix-tree.c: add preallocation

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/generic-radix-tree.h | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
index 8a3e1e886d1c..5b51c3d582d6 100644
--- a/include/linux/generic-radix-tree.h
+++ b/include/linux/generic-radix-tree.h
@@ -41,6 +41,7 @@
 #include <linux/limits.h>
 #include <linux/log2.h>
 #include <linux/math.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 struct genradix_root;
@@ -81,6 +82,10 @@ static inline struct genradix_node *genradix_root_to_node(struct genradix_root *
 	return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
 }
 
+struct __genradix {
+	struct genradix_root		*root;
+};
+
 struct genradix_node {
 	union {
 		/* Interior node: */
@@ -91,9 +96,15 @@ struct genradix_node {
 	};
 };
 
-struct __genradix {
-	struct genradix_root		*root;
-};
+static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
+{
+	return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
+}
+
+static inline void genradix_free_node(struct genradix_node *node)
+{
+	kfree(node);
+}
 
 /*
  * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
@@ -209,7 +220,8 @@ void *__genradix_ptr(struct __genradix *, size_t);
 	 __genradix_ptr(&(_radix)->tree,			\
 			__genradix_idx_to_offset(_radix, _idx)))
 
-void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
+void *__genradix_ptr_alloc(struct __genradix *, size_t,
+			   struct genradix_node **, gfp_t);
 
 #define genradix_ptr_alloc_inlined(_radix, _idx, _gfp)			\
 	(__genradix_cast(_radix)					\
@@ -217,7 +229,15 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
 			__genradix_idx_to_offset(_radix, _idx)) ?:	\
 	  __genradix_ptr_alloc(&(_radix)->tree,				\
 			__genradix_idx_to_offset(_radix, _idx),		\
-			_gfp)))
+			NULL, _gfp)))
+
+#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
+	(__genradix_cast(_radix)					\
+	 (__genradix_ptr_inlined(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _idx)) ?:	\
+	  __genradix_ptr_alloc(&(_radix)->tree,				\
+			__genradix_idx_to_offset(_radix, _idx),		\
+			_new_node, _gfp)))
 
 /**
  * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@@ -232,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
 	(__genradix_cast(_radix)				\
 	 __genradix_ptr_alloc(&(_radix)->tree,			\
 			__genradix_idx_to_offset(_radix, _idx),	\
-			_gfp))
+			NULL, _gfp))
+
+#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
+	(__genradix_cast(_radix)				\
+	 __genradix_ptr_alloc(&(_radix)->tree,			\
+			__genradix_idx_to_offset(_radix, _idx),	\
+			_new_node, _gfp))
 
 struct genradix_iter {
 	size_t			offset;
-- 
cgit v1.2.3


From f877f1d81b2ffcac341ff62ae076d7ad5ba83f46 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 9 Sep 2024 23:00:18 +0900
Subject: firewire: core: use mutex to coordinate concurrent calls to flush
 completions

In current implementation, test_and_set_bit_lock() is used to mediate
concurrent calls of ohci_flush_iso_completions(). However, the ad-hoc
usage of atomic operations is not preferable.

This commit uses mutex_trylock() as the similar operations. The core
function is responsible for the mediation, instead of 1394 OHCI driver.

Link: https://lore.kernel.org/r/20240909140018.65289-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index f815d12deda0..19e8c5f9537c 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -512,6 +512,7 @@ union fw_iso_callback {
 struct fw_iso_context {
 	struct fw_card *card;
 	struct work_struct work;
+	struct mutex flushing_completions_mutex;
 	int type;
 	int channel;
 	int speed;
-- 
cgit v1.2.3


From 34c626c3004a73ee7da5072be48ddff9c2902902 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Mon, 16 Jan 2023 01:16:43 +0200
Subject: net/mlx5: Added missing mlx5_ifc definition for HW Steering

Add mlx5_ifc definitions that are required for HWS support.

Note that due to change in the mlx5_ifc_flow_table_context_bits
structure that now includes both SWS and HWS bits in a union,
this patch also includes small change in one of SWS files that
was required for compilation.

Reviewed-by: Hamdan Agbariya <hamdani@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 189 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 161 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 234ad6f16e92..b6f8e3834bd3 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -80,23 +80,15 @@ enum {
 
 enum {
 	MLX5_OBJ_TYPE_SW_ICM = 0x0008,
-	MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT  = 0x23,
-};
-
-enum {
-	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
-	MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11),
-	MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13),
-	MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT =
-		(1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT),
-	MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39),
-};
-
-enum {
 	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
 	MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d,
 	MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c,
 	MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018,
+	MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT  = 0x23,
+	MLX5_OBJ_TYPE_STC = 0x0040,
+	MLX5_OBJ_TYPE_RTC = 0x0041,
+	MLX5_OBJ_TYPE_STE = 0x0042,
+	MLX5_OBJ_TYPE_MODIFY_HDR_PATTERN = 0x0043,
 	MLX5_OBJ_TYPE_PAGE_TRACK = 0x46,
 	MLX5_OBJ_TYPE_MKEY = 0xff01,
 	MLX5_OBJ_TYPE_QP = 0xff02,
@@ -112,6 +104,16 @@ enum {
 	MLX5_OBJ_TYPE_RQT = 0xff0e,
 	MLX5_OBJ_TYPE_FLOW_COUNTER = 0xff0f,
 	MLX5_OBJ_TYPE_CQ = 0xff10,
+	MLX5_OBJ_TYPE_FT_ALIAS = 0xff15,
+};
+
+enum {
+	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
+	MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11),
+	MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13),
+	MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT =
+		(1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT),
+	MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39),
 };
 
 enum {
@@ -313,6 +315,7 @@ enum {
 	MLX5_CMD_OP_MODIFY_VHCA_STATE             = 0xb0e,
 	MLX5_CMD_OP_SYNC_CRYPTO                   = 0xb12,
 	MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS       = 0xb16,
+	MLX5_CMD_OP_GENERATE_WQE                  = 0xb17,
 	MLX5_CMD_OP_MAX
 };
 
@@ -485,7 +488,13 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         reserved_at_66[0x2];
 	u8         reformat_add_macsec[0x1];
 	u8         reformat_remove_macsec[0x1];
-	u8         reserved_at_6a[0xe];
+	u8         reparse[0x1];
+	u8         reserved_at_6b[0x1];
+	u8         cross_vhca_object[0x1];
+	u8         reformat_l2_to_l3_audp_tunnel[0x1];
+	u8         reformat_l3_audp_tunnel_to_l2[0x1];
+	u8         ignore_flow_level_rtc_valid[0x1];
+	u8         reserved_at_70[0x8];
 	u8         log_max_ft_num[0x8];
 
 	u8         reserved_at_80[0x10];
@@ -522,7 +531,15 @@ struct mlx5_ifc_ipv6_layout_bits {
 	u8         ipv6[16][0x8];
 };
 
+struct mlx5_ifc_ipv6_simple_layout_bits {
+	u8         ipv6_127_96[0x20];
+	u8         ipv6_95_64[0x20];
+	u8         ipv6_63_32[0x20];
+	u8         ipv6_31_0[0x20];
+};
+
 union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits {
+	struct mlx5_ifc_ipv6_simple_layout_bits ipv6_simple_layout;
 	struct mlx5_ifc_ipv6_layout_bits ipv6_layout;
 	struct mlx5_ifc_ipv4_layout_bits ipv4_layout;
 	u8         reserved_at_0[0x80];
@@ -911,7 +928,9 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      reserved_at_8[0x5];
 	u8      fdb_uplink_hairpin[0x1];
 	u8      fdb_multi_path_any_table_limit_regc[0x1];
-	u8      reserved_at_f[0x3];
+	u8      reserved_at_f[0x1];
+	u8      fdb_dynamic_tunnel[0x1];
+	u8      reserved_at_11[0x1];
 	u8      fdb_multi_path_any_table[0x1];
 	u8      reserved_at_13[0x2];
 	u8      fdb_modify_header_fwd_to_table[0x1];
@@ -950,6 +969,73 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      reserved_at_1900[0x6700];
 };
 
+struct mlx5_ifc_wqe_based_flow_table_cap_bits {
+	u8         reserved_at_0[0x3];
+	u8         log_max_num_ste[0x5];
+	u8         reserved_at_8[0x3];
+	u8         log_max_num_stc[0x5];
+	u8         reserved_at_10[0x3];
+	u8         log_max_num_rtc[0x5];
+	u8         reserved_at_18[0x3];
+	u8         log_max_num_header_modify_pattern[0x5];
+
+	u8         rtc_hash_split_table[0x1];
+	u8         rtc_linear_lookup_table[0x1];
+	u8         reserved_at_22[0x1];
+	u8         stc_alloc_log_granularity[0x5];
+	u8         reserved_at_28[0x3];
+	u8         stc_alloc_log_max[0x5];
+	u8         reserved_at_30[0x3];
+	u8         ste_alloc_log_granularity[0x5];
+	u8         reserved_at_38[0x3];
+	u8         ste_alloc_log_max[0x5];
+
+	u8         reserved_at_40[0xb];
+	u8         rtc_reparse_mode[0x5];
+	u8         reserved_at_50[0x3];
+	u8         rtc_index_mode[0x5];
+	u8         reserved_at_58[0x3];
+	u8         rtc_log_depth_max[0x5];
+
+	u8         reserved_at_60[0x10];
+	u8         ste_format[0x10];
+
+	u8         stc_action_type[0x80];
+
+	u8         header_insert_type[0x10];
+	u8         header_remove_type[0x10];
+
+	u8         trivial_match_definer[0x20];
+
+	u8         reserved_at_140[0x1b];
+	u8         rtc_max_num_hash_definer_gen_wqe[0x5];
+
+	u8         reserved_at_160[0x18];
+	u8         access_index_mode[0x8];
+
+	u8         reserved_at_180[0x10];
+	u8         ste_format_gen_wqe[0x10];
+
+	u8         linear_match_definer_reg_c3[0x20];
+
+	u8         fdb_jump_to_tir_stc[0x1];
+	u8         reserved_at_1c1[0x1f];
+};
+
+struct mlx5_ifc_esw_cap_bits {
+	u8         reserved_at_0[0x1d];
+	u8         merged_eswitch[0x1];
+	u8         reserved_at_1e[0x2];
+
+	u8         reserved_at_20[0x40];
+
+	u8         esw_manager_vport_number_valid[0x1];
+	u8         reserved_at_61[0xf];
+	u8         esw_manager_vport_number[0x10];
+
+	u8         reserved_at_80[0x780];
+};
+
 enum {
 	MLX5_COUNTER_SOURCE_ESWITCH = 0x0,
 	MLX5_COUNTER_FLOW_ESWITCH   = 0x1,
@@ -1443,9 +1529,13 @@ enum {
 };
 
 enum {
+	MLX5_FLEX_IPV4_OVER_VXLAN_ENABLED	= 1 << 0,
+	MLX5_FLEX_IPV6_OVER_VXLAN_ENABLED	= 1 << 1,
+	MLX5_FLEX_IPV6_OVER_IP_ENABLED		= 1 << 2,
 	MLX5_FLEX_PARSER_GENEVE_ENABLED		= 1 << 3,
 	MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED	= 1 << 4,
 	MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED	= 1 << 5,
+	MLX5_FLEX_P_BIT_VXLAN_GPE_ENABLED	= 1 << 6,
 	MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED	= 1 << 7,
 	MLX5_FLEX_PARSER_ICMP_V4_ENABLED	= 1 << 8,
 	MLX5_FLEX_PARSER_ICMP_V6_ENABLED	= 1 << 9,
@@ -1650,7 +1740,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         pci_sync_for_fw_update_event[0x1];
 	u8         reserved_at_1f2[0x6];
 	u8         init2_lag_tx_port_affinity[0x1];
-	u8         reserved_at_1fa[0x3];
+	u8         reserved_at_1fa[0x2];
+	u8         wqe_based_flow_table_update_cap[0x1];
 	u8         cqe_version[0x4];
 
 	u8         compact_address_vector[0x1];
@@ -1959,7 +2050,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_760[0x3];
 	u8         log_max_num_header_modify_argument[0x5];
-	u8         reserved_at_768[0x4];
+	u8         log_header_modify_argument_granularity_offset[0x4];
 	u8         log_header_modify_argument_granularity[0x4];
 	u8         reserved_at_770[0x3];
 	u8         log_header_modify_argument_max_alloc[0x5];
@@ -2006,7 +2097,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_140[0x60];
 
 	u8	   flow_table_type_2_type[0x8];
-	u8	   reserved_at_1a8[0x3];
+	u8	   reserved_at_1a8[0x2];
+	u8         format_select_dw_8_6_ext[0x1];
 	u8	   log_min_mkey_entity_size[0x5];
 	u8	   reserved_at_1b0[0x10];
 
@@ -2022,6 +2114,16 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_250[0x10];
 
 	u8	   reserved_at_260[0x120];
+
+	u8	   format_select_dw_gtpu_dw_0[0x8];
+	u8	   format_select_dw_gtpu_dw_1[0x8];
+	u8	   format_select_dw_gtpu_dw_2[0x8];
+	u8	   format_select_dw_gtpu_first_ext_dw_0[0x8];
+
+	u8	   generate_wqe_type[0x20];
+
+	u8	   reserved_at_2c0[0xc0];
+
 	u8	   reserved_at_380[0xb];
 	u8	   min_mkey_log_entity_size_fixed_buffer[0x5];
 	u8	   ec_vf_vport_base[0x10];
@@ -2037,9 +2139,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   reserved_at_400[0x1];
 	u8	   min_mkey_log_entity_size_fixed_buffer_valid[0x1];
-	u8	   reserved_at_402[0x1e];
+	u8	   reserved_at_402[0xe];
+	u8	   return_reg_id[0x10];
 
-	u8	   reserved_at_420[0x20];
+	u8	   reserved_at_420[0x1c];
+	u8	   flow_table_hash_type[0x4];
 
 	u8	   reserved_at_440[0x8];
 	u8	   max_num_eqs_24b[0x18];
@@ -2086,7 +2190,7 @@ struct mlx5_ifc_extended_dest_format_bits {
 	u8         reserved_at_60[0x20];
 };
 
-union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits {
+union mlx5_ifc_dest_format_flow_counter_list_auto_bits {
 	struct mlx5_ifc_extended_dest_format_bits extended_dest_format;
 	struct mlx5_ifc_flow_counter_list_bits flow_counter_list;
 };
@@ -2178,7 +2282,10 @@ struct mlx5_ifc_wq_bits {
 	u8         reserved_at_139[0x4];
 	u8         log_wqe_stride_size[0x3];
 
-	u8         reserved_at_140[0x80];
+	u8         dbr_umem_id[0x20];
+	u8         wq_umem_id[0x20];
+
+	u8         wq_umem_offset[0x40];
 
 	u8         headers_mkey[0x20];
 
@@ -3562,6 +3669,8 @@ union mlx5_ifc_hca_cap_union_bits {
 	struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps;
 	struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
 	struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap;
+	struct mlx5_ifc_wqe_based_flow_table_cap_bits wqe_based_flow_table_cap;
+	struct mlx5_ifc_esw_cap_bits esw_cap;
 	struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
 	struct mlx5_ifc_port_selection_cap_bits port_selection_cap;
 	struct mlx5_ifc_qos_cap_bits qos_cap;
@@ -3678,7 +3787,7 @@ struct mlx5_ifc_flow_context_bits {
 
 	u8         reserved_at_1300[0x500];
 
-	union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[];
+	union mlx5_ifc_dest_format_flow_counter_list_auto_bits destination[];
 };
 
 enum {
@@ -3919,7 +4028,8 @@ struct mlx5_ifc_sqc_bits {
 	u8         reg_umr[0x1];
 	u8         allow_swp[0x1];
 	u8         hairpin[0x1];
-	u8         reserved_at_f[0xb];
+	u8         non_wire[0x1];
+	u8         reserved_at_10[0xa];
 	u8	   ts_format[0x2];
 	u8	   reserved_at_1c[0x4];
 
@@ -4961,6 +5071,16 @@ struct mlx5_ifc_set_fte_in_bits {
 	struct mlx5_ifc_flow_context_bits flow_context;
 };
 
+struct mlx5_ifc_dest_format_bits {
+	u8         destination_type[0x8];
+	u8         destination_id[0x18];
+
+	u8         destination_eswitch_owner_vhca_id_valid[0x1];
+	u8         packet_reformat[0x1];
+	u8         reserved_at_22[0xe];
+	u8         destination_eswitch_owner_vhca_id[0x10];
+};
+
 struct mlx5_ifc_rts2rts_qp_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -6127,7 +6247,8 @@ struct mlx5_ifc_flow_table_context_bits {
 	u8         termination_table[0x1];
 	u8         table_miss_action[0x4];
 	u8         level[0x8];
-	u8         reserved_at_10[0x8];
+	u8         rtc_valid[0x1];
+	u8         reserved_at_11[0x7];
 	u8         log_size[0x8];
 
 	u8         reserved_at_20[0x8];
@@ -6137,11 +6258,21 @@ struct mlx5_ifc_flow_table_context_bits {
 	u8         lag_master_next_table_id[0x18];
 
 	u8         reserved_at_60[0x60];
+	union {
+		struct {
+			u8         sw_owner_icm_root_1[0x40];
+
+			u8         sw_owner_icm_root_0[0x40];
+		} sws;
+		struct {
+			u8         rtc_id_0[0x20];
 
-	u8         sw_owner_icm_root_1[0x40];
+			u8         rtc_id_1[0x20];
 
-	u8         sw_owner_icm_root_0[0x40];
+			u8         reserved_at_100[0x40];
 
+		} hws;
+	};
 };
 
 struct mlx5_ifc_query_flow_table_out_bits {
@@ -8923,7 +9054,9 @@ struct mlx5_ifc_create_qp_in_bits {
 
 	struct mlx5_ifc_qpc_bits qpc;
 
-	u8         reserved_at_800[0x60];
+	u8         wq_umem_offset[0x40];
+
+	u8         wq_umem_id[0x20];
 
 	u8         wq_umem_valid[0x1];
 	u8         reserved_at_861[0x1f];
-- 
cgit v1.2.3


From 00b9f0daefd7670356e592d418c890acf9179c94 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 17 Jan 2023 17:14:51 +0200
Subject: net/mlx5: Added missing definitions in preparation for HW Steering

As part of preparation for HWS, added missing definitions
in qp.h and fs_core.h:

 - FS_FT_FDB_RX/TX table types that are used by HWS in addition
   to an existing FS_FT_FDB
 - MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE that is used by HWS to
   require fence in WQE

Reviewed-by: Hamdan Agbariya <hamdani@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/qp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index ad1ce650146c..fc7eeff99a8a 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -149,6 +149,7 @@ enum {
 	MLX5_WQE_CTRL_CQ_UPDATE		= 2 << 2,
 	MLX5_WQE_CTRL_CQ_UPDATE_AND_EQE	= 3 << 2,
 	MLX5_WQE_CTRL_SOLICITED		= 1 << 1,
+	MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE = 1 << 5,
 };
 
 enum {
-- 
cgit v1.2.3


From 452ef7f86036392005940de54228d42ca0044192 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Mon, 5 Aug 2024 10:03:20 +0300
Subject: net/mlx5: Add missing masks and QoS bit masks for scheduling elements

Add the missing masks for supported element types and Transmit
Scheduling Arbiter (TSAR) types in scheduling elements.

Also, add the corresponding bit masks for these types in the QoS
capabilities of a NIC scheduler.

Fixes: 214baf22870c ("net/mlx5e: Support HTB offload")
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cab228cf51c6..cfdf984a95a8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1027,7 +1027,8 @@ struct mlx5_ifc_qos_cap_bits {
 
 	u8         max_tsar_bw_share[0x20];
 
-	u8         reserved_at_100[0x20];
+	u8         nic_element_type[0x10];
+	u8         nic_tsar_type[0x10];
 
 	u8         reserved_at_120[0x3];
 	u8         log_meter_aso_granularity[0x5];
@@ -3966,6 +3967,7 @@ enum {
 	ELEMENT_TYPE_CAP_MASK_VPORT		= 1 << 1,
 	ELEMENT_TYPE_CAP_MASK_VPORT_TC		= 1 << 2,
 	ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC	= 1 << 3,
+	ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP	= 1 << 4,
 };
 
 struct mlx5_ifc_scheduling_context_bits {
@@ -4675,6 +4677,12 @@ enum {
 	TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2,
 };
 
+enum {
+	TSAR_TYPE_CAP_MASK_DWRR		= 1 << 0,
+	TSAR_TYPE_CAP_MASK_ROUND_ROBIN	= 1 << 1,
+	TSAR_TYPE_CAP_MASK_ETS		= 1 << 2,
+};
+
 struct mlx5_ifc_tsar_element_bits {
 	u8         reserved_at_0[0x8];
 	u8         tsar_type[0x8];
-- 
cgit v1.2.3


From 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 4 Sep 2024 17:47:45 +0200
Subject: netdevice: add netdev_tx_reset_subqueue() shorthand

Add a shorthand similar to other net*_subqueue() helpers for resetting
the queue by its index w/o obtaining &netdev_tx_queue beforehand
manually.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/netdevice.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b47c00657bd0..44d1dbb54ffe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3568,6 +3568,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q)
 #endif
 }
 
+/**
+ * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue
+ * @dev: network device
+ * @qid: stack index of the queue to reset
+ */
+static inline void netdev_tx_reset_subqueue(const struct net_device *dev,
+					    u32 qid)
+{
+	netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid));
+}
+
 /**
  * 	netdev_reset_queue - reset the packets and bytes count of a network device
  * 	@dev_queue: network device
@@ -3577,7 +3588,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q)
  */
 static inline void netdev_reset_queue(struct net_device *dev_queue)
 {
-	netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
+	netdev_tx_reset_subqueue(dev_queue, 0);
 }
 
 /**
-- 
cgit v1.2.3


From 40a895fd9a358eea16901026360bd2cd3ca691a7 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 5 Sep 2024 15:35:45 -0700
Subject: cxl: move cxl headers to new include/cxl/ directory

Group all cxl related kernel headers into include/cxl/ directory.

Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Link: https://patch.msgid.link/20240905223711.1990186-2-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cxl-event.h | 175 ----------------------------------------------
 include/linux/einj-cxl.h  |  44 ------------
 2 files changed, 219 deletions(-)
 delete mode 100644 include/linux/cxl-event.h
 delete mode 100644 include/linux/einj-cxl.h

(limited to 'include/linux')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
deleted file mode 100644
index 0bea1afbd747..000000000000
--- a/include/linux/cxl-event.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright(c) 2023 Intel Corporation. */
-#ifndef _LINUX_CXL_EVENT_H
-#define _LINUX_CXL_EVENT_H
-
-#include <linux/types.h>
-#include <linux/uuid.h>
-#include <linux/workqueue_types.h>
-
-/*
- * Common Event Record Format
- * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
- */
-struct cxl_event_record_hdr {
-	u8 length;
-	u8 flags[3];
-	__le16 handle;
-	__le16 related_handle;
-	__le64 timestamp;
-	u8 maint_op_class;
-	u8 reserved[15];
-} __packed;
-
-struct cxl_event_media_hdr {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	/*
-	 * The meaning of Validity Flags from bit 2 is
-	 * different across DRAM and General Media records
-	 */
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
-} __packed;
-
-#define CXL_EVENT_RECORD_DATA_LENGTH 0x50
-struct cxl_event_generic {
-	struct cxl_event_record_hdr hdr;
-	u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
-} __packed;
-
-/*
- * General Media Event Record
- * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
- */
-#define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
-struct cxl_event_gen_media {
-	struct cxl_event_media_hdr media_hdr;
-	u8 device[3];
-	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
-	u8 reserved[46];
-} __packed;
-
-/*
- * DRAM Event Record - DER
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44
- */
-#define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
-struct cxl_event_dram {
-	struct cxl_event_media_hdr media_hdr;
-	u8 nibble_mask[3];
-	u8 bank_group;
-	u8 bank;
-	u8 row[3];
-	u8 column[2];
-	u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE];
-	u8 reserved[0x17];
-} __packed;
-
-/*
- * Get Health Info Record
- * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
- */
-struct cxl_get_health_info {
-	u8 health_status;
-	u8 media_status;
-	u8 add_status;
-	u8 life_used;
-	u8 device_temp[2];
-	u8 dirty_shutdown_cnt[4];
-	u8 cor_vol_err_cnt[4];
-	u8 cor_per_err_cnt[4];
-} __packed;
-
-/*
- * Memory Module Event Record
- * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
- */
-struct cxl_event_mem_module {
-	struct cxl_event_record_hdr hdr;
-	u8 event_type;
-	struct cxl_get_health_info info;
-	u8 reserved[0x3d];
-} __packed;
-
-union cxl_event {
-	struct cxl_event_generic generic;
-	struct cxl_event_gen_media gen_media;
-	struct cxl_event_dram dram;
-	struct cxl_event_mem_module mem_module;
-	/* dram & gen_media event header */
-	struct cxl_event_media_hdr media_hdr;
-} __packed;
-
-/*
- * Common Event Record Format; in event logs
- * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
- */
-struct cxl_event_record_raw {
-	uuid_t id;
-	union cxl_event event;
-} __packed;
-
-enum cxl_event_type {
-	CXL_CPER_EVENT_GENERIC,
-	CXL_CPER_EVENT_GEN_MEDIA,
-	CXL_CPER_EVENT_DRAM,
-	CXL_CPER_EVENT_MEM_MODULE,
-};
-
-#define CPER_CXL_DEVICE_ID_VALID		BIT(0)
-#define CPER_CXL_DEVICE_SN_VALID		BIT(1)
-#define CPER_CXL_COMP_EVENT_LOG_VALID		BIT(2)
-struct cxl_cper_event_rec {
-	struct {
-		u32 length;
-		u64 validation_bits;
-		struct cper_cxl_event_devid {
-			u16 vendor_id;
-			u16 device_id;
-			u8 func_num;
-			u8 device_num;
-			u8 bus_num;
-			u16 segment_num;
-			u16 slot_num; /* bits 2:0 reserved */
-			u8 reserved;
-		} __packed device_id;
-		struct cper_cxl_event_sn {
-			u32 lower_dw;
-			u32 upper_dw;
-		} __packed dev_serial_num;
-	} __packed hdr;
-
-	union cxl_event event;
-} __packed;
-
-struct cxl_cper_work_data {
-	enum cxl_event_type event_type;
-	struct cxl_cper_event_rec rec;
-};
-
-#ifdef CONFIG_ACPI_APEI_GHES
-int cxl_cper_register_work(struct work_struct *work);
-int cxl_cper_unregister_work(struct work_struct *work);
-int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
-#else
-static inline int cxl_cper_register_work(struct work_struct *work)
-{
-	return 0;
-}
-
-static inline int cxl_cper_unregister_work(struct work_struct *work)
-{
-	return 0;
-}
-static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
-{
-	return 0;
-}
-#endif
-
-#endif /* _LINUX_CXL_EVENT_H */
diff --git a/include/linux/einj-cxl.h b/include/linux/einj-cxl.h
deleted file mode 100644
index 624ff6ff41f9..000000000000
--- a/include/linux/einj-cxl.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * CXL protocol Error INJection support.
- *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Author: Ben Cheatham <benjamin.cheatham@amd.com>
- */
-#ifndef EINJ_CXL_H
-#define EINJ_CXL_H
-
-#include <linux/errno.h>
-#include <linux/types.h>
-
-struct pci_dev;
-struct seq_file;
-
-#if IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL)
-int einj_cxl_available_error_type_show(struct seq_file *m, void *v);
-int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type);
-int einj_cxl_inject_rch_error(u64 rcrb, u64 type);
-bool einj_cxl_is_initialized(void);
-#else /* !IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) */
-static inline int einj_cxl_available_error_type_show(struct seq_file *m,
-						     void *v)
-{
-	return -ENXIO;
-}
-
-static inline int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type)
-{
-	return -ENXIO;
-}
-
-static inline int einj_cxl_inject_rch_error(u64 rcrb, u64 type)
-{
-	return -ENXIO;
-}
-
-static inline bool einj_cxl_is_initialized(void) { return false; }
-#endif /* CONFIG_ACPI_APEI_EINJ_CXL */
-
-#endif /* EINJ_CXL_H */
-- 
cgit v1.2.3


From 246d3aa3e53151fa150f10257ddd8a4facd31a6a Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 8 Aug 2024 12:18:46 +0100
Subject: mm: cleanup count_mthp_stat() definition

Patch series "Shmem mTHP controls and stats improvements", v3.

This is a small series to tidy up the way the shmem controls and stats are
exposed.  These patches were previously part of the series at [2], but I
decided to split them out since they can go in independently.


This patch (of 2):

Let's move count_mthp_stat() so that it's always defined, even when THP is
disabled.  Previously uses of the function in files such as shmem.c, which
are compiled even when THP is disabled, required ugly THP ifdeferry.  With
this cleanup, we can remove those ifdefs and the function resolves to a
nop when THP is disabled.

I shortly plan to call count_mthp_stat() from more THP-invariant source
files.

Link: https://lkml.kernel.org/r/20240808111849.651867-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20240808111849.651867-2-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Barry Song <baohua@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lance Yang <ioworker0@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 70 ++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6370026689e0..4c32058cacfe 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -114,6 +114,41 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
 
+enum mthp_stat_item {
+	MTHP_STAT_ANON_FAULT_ALLOC,
+	MTHP_STAT_ANON_FAULT_FALLBACK,
+	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_SWPOUT,
+	MTHP_STAT_SWPOUT_FALLBACK,
+	MTHP_STAT_SHMEM_ALLOC,
+	MTHP_STAT_SHMEM_FALLBACK,
+	MTHP_STAT_SHMEM_FALLBACK_CHARGE,
+	MTHP_STAT_SPLIT,
+	MTHP_STAT_SPLIT_FAILED,
+	MTHP_STAT_SPLIT_DEFERRED,
+	__MTHP_STAT_COUNT
+};
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
+struct mthp_stat {
+	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	if (order <= 0 || order > PMD_ORDER)
+		return;
+
+	this_cpu_inc(mthp_stats.stats[order][item]);
+}
+#else
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+}
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 extern unsigned long transparent_hugepage_flags;
@@ -269,41 +304,6 @@ struct thpsize {
 
 #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
 
-enum mthp_stat_item {
-	MTHP_STAT_ANON_FAULT_ALLOC,
-	MTHP_STAT_ANON_FAULT_FALLBACK,
-	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
-	MTHP_STAT_SWPOUT,
-	MTHP_STAT_SWPOUT_FALLBACK,
-	MTHP_STAT_SHMEM_ALLOC,
-	MTHP_STAT_SHMEM_FALLBACK,
-	MTHP_STAT_SHMEM_FALLBACK_CHARGE,
-	MTHP_STAT_SPLIT,
-	MTHP_STAT_SPLIT_FAILED,
-	MTHP_STAT_SPLIT_DEFERRED,
-	__MTHP_STAT_COUNT
-};
-
-struct mthp_stat {
-	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
-};
-
-#ifdef CONFIG_SYSFS
-DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
-
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
-	if (order <= 0 || order > PMD_ORDER)
-		return;
-
-	this_cpu_inc(mthp_stats.stats[order][item]);
-}
-#else
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
-}
-#endif
-
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
-- 
cgit v1.2.3


From 5d65c8d758f2596c008009e39bb2614deed2c730 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 24 Aug 2024 13:04:40 +1200
Subject: mm: count the number of anonymous THPs per size

Patch series "mm: count the number of anonymous THPs per size", v4.

Knowing the number of transparent anon THPs in the system is crucial
for performance analysis. It helps in understanding the ratio and
distribution of THPs versus small folios throughout the system.

Additionally, partial unmapping by userspace can lead to significant waste
of THPs over time and increase memory reclamation pressure. We need this
information for comprehensive system tuning.


This patch (of 2):

Let's track for each anonymous THP size, how many of them are currently
allocated.  We'll track the complete lifespan of an anon THP, starting
when it becomes an anon THP ("large anon folio") (->mapping gets set),
until it gets freed (->mapping gets cleared).

Introduce a new "nr_anon" counter per THP size and adjust the
corresponding counter in the following cases:
* We allocate a new THP and call folio_add_new_anon_rmap() to map
   it the first time and turn it into an anon THP.
* We split an anon THP into multiple smaller ones.
* We migrate an anon THP, when we prepare the destination.
* We free an anon THP back to the buddy.

Note that AnonPages in /proc/meminfo currently tracks the total number of
*mapped* anonymous *pages*, and therefore has slightly different
semantics.  In the future, we might also want to track "nr_anon_mapped"
for each THP size, which might be helpful when comparing it to the number
of allocated anon THPs (long-term pinning, stuck in swapcache, memory
leaks, ...).

Further note that for now, we only track anon THPs after they got their
->mapping set, for example via folio_add_new_anon_rmap().  If we would
allocate some in the swapcache, they will only show up in the statistics
for now after they have been mapped to user space the first time, where we
call folio_add_new_anon_rmap().

[akpm@linux-foundation.org: documentation fixups, per David]
  Link: https://lkml.kernel.org/r/3e8add35-e26b-443b-8a04-1078f4bc78f6@redhat.com
Link: https://lkml.kernel.org/r/20240824010441.21308-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240824010441.21308-2-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Chuanhua Han <hanchuanhua@oppo.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuai Yuan <yuanshuai@oppo.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4c32058cacfe..2ee2971e4e10 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -126,6 +126,7 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT,
 	MTHP_STAT_SPLIT_FAILED,
 	MTHP_STAT_SPLIT_DEFERRED,
+	MTHP_STAT_NR_ANON,
 	__MTHP_STAT_COUNT
 };
 
@@ -136,14 +137,24 @@ struct mthp_stat {
 
 DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
 
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
 {
 	if (order <= 0 || order > PMD_ORDER)
 		return;
 
-	this_cpu_inc(mthp_stats.stats[order][item]);
+	this_cpu_add(mthp_stats.stats[order][item], delta);
+}
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	mod_mthp_stat(order, item, 1);
 }
+
 #else
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
+{
+}
+
 static inline void count_mthp_stat(int order, enum mthp_stat_item item)
 {
 }
-- 
cgit v1.2.3


From 8175ebfd302abe6fbdca9037f763ecbfdb8db572 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 24 Aug 2024 13:04:41 +1200
Subject: mm: count the number of partially mapped anonymous THPs per size

When a THP is added to the deferred_list due to partially mapped, its
partial pages are unused, leading to wasted memory and potentially
increasing memory reclamation pressure.

Detailing the specifics of how unmapping occurs is quite difficult and not
that useful, so we adopt a simple approach: each time a THP enters the
deferred_list, we increment the count by 1; whenever it leaves for any
reason, we decrement the count by 1.

Link: https://lkml.kernel.org/r/20240824010441.21308-3-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Chuanhua Han <hanchuanhua@oppo.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuai Yuan <yuanshuai@oppo.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2ee2971e4e10..4902e2f7e896 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -127,6 +127,7 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_FAILED,
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
+	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
 	__MTHP_STAT_COUNT
 };
 
-- 
cgit v1.2.3


From b7315fbb64736acad3cd33851884b222b00dc0f4 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sun, 25 Aug 2024 21:23:20 -0700
Subject: mm/damon/core: introduce per-context region priorities histogram
 buffer

Patch series "replace per-quota region priorities histogram buffer with
per-context one".

Each DAMOS quota (struct damos_quota) maintains a histogram for total
regions size per its prioritization score.  DAMOS calcultes minimum
prioritization score of regions that are ok to apply the DAMOS action to
while respecting the quota.  The histogram is constructed only for the
calculation of the minimum score in damos_adjust_quota() for each quota
which called by kdamond_fn().

Hence, there is no real reason to have per-quota histogram.  Only
per-kdamond histogram is needed, since parallel kdamonds could have races
otherwise.  The current implementation is only wasting the memory, and can
easily cause unintended stack usage[1].

So, introducing a per-kdamond histogram and replacing the per-quota one
with it would be the right solution for the issue.  However, supporting
multiple DAMON contexts per kdamond is still an ongoing work[2] without a
clear estimated time of arrival.  Meanwhile, per-context histogram could
be an effective and straightforward solution having no blocker.  Let's fix
the problem first in the way.


This patch (of 4):

Introduce per-context buffer for region priority scores-total size
histogram.  Same to the per-quota one (->histogram of struct damos_quota),
the new buffer is hidden from DAMON API users by being defined as a
private field of DAMON context structure.  It is dynamically allocated and
de-allocated at the beginning and ending of the execution of the kdamond
by kdamond_fn() itself.

[1] commit 0742cadf5e4c ("mm/damon/lru_sort: adjust local variable to dynamic allocation")
[2] https://lore.kernel.org/20240531122320.909060-1-yorha.op@gmail.com

Link: https://lkml.kernel.org/r/20240826042323.87025-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20240826042323.87025-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 27c546bfc6d4..60cb8eada3d6 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -630,6 +630,8 @@ struct damon_ctx {
 	unsigned long next_ops_update_sis;
 	/* for waiting until the execution of the kdamond_fn is started */
 	struct completion kdamond_started;
+	/* for scheme quotas prioritization */
+	unsigned long *regions_score_histogram;
 
 /* public: */
 	struct task_struct *kdamond;
-- 
cgit v1.2.3


From e3bcb1672583f2e1cf456e4303ce562a3cc775c0 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sun, 25 Aug 2024 21:23:22 -0700
Subject: mm/damon/core: remove per-scheme region priority histogram buffer

Nobody is reading from or writing to the per-scheme region priorities
histogram buffer.  It is only wasting memory.  Remove it.

Link: https://lkml.kernel.org/r/20240826042323.87025-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 60cb8eada3d6..a67f2c4940e9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -233,7 +233,6 @@ struct damos_quota {
 	unsigned long charge_addr_from;
 
 	/* For prioritization */
-	unsigned long histogram[DAMOS_MAX_SCORE + 1];
 	unsigned int min_score;
 
 	/* For feedback loop */
-- 
cgit v1.2.3


From 17d75422604f0b92869aa17cb44f60958212f033 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 31 Aug 2024 08:28:22 +1200
Subject: mm: document __GFP_NOFAIL must be blockable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Non-blocking allocation with __GFP_NOFAIL is not supported and may still
result in NULL pointers (if we don't return NULL, we result in busy-loop
within non-sleepable contexts):

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
						struct alloc_context *ac)
{
	...
	/*
	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
	 * we always retry
	 */
	if (gfp_mask & __GFP_NOFAIL) {
		/*
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually require GFP_NOWAIT
		 */
		if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
			goto fail;
		...
	}
	...
fail:
	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);
got_pg:
	return page;
}

Highlight this in the documentation of __GFP_NOFAIL so that non-mm
subsystems can reject any illegal usage of __GFP_NOFAIL with GFP_ATOMIC,
GFP_NOWAIT, etc.

Link: https://lkml.kernel.org/r/20240830202823.21478-3-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Eugenio Pérez" <eperezma@redhat.com>
Cc: Hailong.Liu <hailong.liu@oppo.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Xie Yongji <xieyongji@bytedance.com>
Cc: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 313be4ad79fd..4a1fa7706b0c 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -215,7 +215,8 @@ enum {
  * the caller still has to check for failures) while costly requests try to be
  * not disruptive and back off even without invoking the OOM killer.
  * The following three modifiers might be used to override some of these
- * implicit rules.
+ * implicit rules. Please note that all of them must be used along with
+ * %__GFP_DIRECT_RECLAIM flag.
  *
  * %__GFP_NORETRY: The VM implementation will try only very lightweight
  * memory direct reclaim to get some memory under memory pressure (thus
@@ -246,6 +247,8 @@ enum {
  * cannot handle allocation failures. The allocation could block
  * indefinitely but will never return with failure. Testing for
  * failure is pointless.
+ * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM.
+ * It should _never_ be used in non-sleepable contexts.
  * New users should be evaluated carefully (and the flag should be
  * used only when there is no reasonable failure policy) but it is
  * definitely preferable to use the flag rather than opencode endless
-- 
cgit v1.2.3


From 903edea6c53f097f5f0c847fdbbfab0c6c44f241 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 31 Aug 2024 08:28:23 +1200
Subject: mm: warn about illegal __GFP_NOFAIL usage in a more appropriate
 location and manner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three points for this change:

1. We should consolidate all warnings in one place. Currently, the
   order > 1 warning is in the hotpath, while others are in less
   likely scenarios. Moving all warnings to the slowpath will reduce
   the overhead for order > 1 and increase the visibility of other
   warnings.

2. We currently have two warnings for order: one for order > 1 in
   the hotpath and another for order > costly_order in the laziest
   path. I suggest standardizing on order > 1 since it's been in
   use for a long time.

3. We don't need to check for __GFP_NOWARN in this case. __GFP_NOWARN
   is meant to suppress allocation failure reports, but here we're
   dealing with bug detection, not allocation failures. So replace
   WARN_ON_ONCE_GFP by WARN_ON_ONCE.

[v-songbaohua@oppo.com: also update the doc for __GFP_NOFAIL with order > 1]
  Link: https://lkml.kernel.org/r/20240903223935.1697-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240830202823.21478-4-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: "Eugenio Pérez" <eperezma@redhat.com>
Cc: Hailong.Liu <hailong.liu@oppo.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Xie Yongji <xieyongji@bytedance.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 4a1fa7706b0c..65db9349f905 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -253,7 +253,8 @@ enum {
  * used only when there is no reasonable failure policy) but it is
  * definitely preferable to use the flag rather than opencode endless
  * loop around allocator.
- * Using this flag for costly allocations is _highly_ discouraged.
+ * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is
+ * not supported. Please consider using kvmalloc() instead.
  */
 #define __GFP_IO	((__force gfp_t)___GFP_IO)
 #define __GFP_FS	((__force gfp_t)___GFP_FS)
-- 
cgit v1.2.3


From b1f202060afeb7fcb98473929d26fd3d2093b067 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 30 Aug 2024 11:03:36 +0100
Subject: mm: remap unused subpages to shared zeropage when splitting isolated
 thp

Patch series "mm: split underused THPs", v5.

The current upstream default policy for THP is always.  However, Meta uses
madvise in production as the current THP=always policy vastly
overprovisions THPs in sparsely accessed memory areas, resulting in
excessive memory pressure and premature OOM killing.  Using madvise +
relying on khugepaged has certain drawbacks over THP=always.  Using
madvise hints mean THPs aren't "transparent" and require userspace
changes.  Waiting for khugepaged to scan memory and collapse pages into
THP can be slow and unpredictable in terms of performance (i.e.  you dont
know when the collapse will happen), while production environments require
predictable performance.  If there is enough memory available, its better
for both performance and predictability to have a THP from fault time,
i.e.  THP=always rather than wait for khugepaged to collapse it, and deal
with sparsely populated THPs when the system is running out of memory.

This patch series is an attempt to mitigate the issue of running out of
memory when THP is always enabled.  During runtime whenever a THP is being
faulted in or collapsed by khugepaged, the THP is added to a list.
Whenever memory reclaim happens, the kernel runs the deferred_split
shrinker which goes through the list and checks if the THP was underused,
i.e.  how many of the base 4K pages of the entire THP were zero-filled.
If this number goes above a certain threshold, the shrinker will attempt
to split that THP.  Then at remap time, the pages that were zero-filled
are mapped to the shared zeropage, hence saving memory.  This method
avoids the downside of wasting memory in areas where THP is sparsely
filled when THP is always enabled, while still providing the upside THPs
like reduced TLB misses without having to use madvise.

Meta production workloads that were CPU bound (>99% CPU utilzation) were
tested with THP shrinker.  The results after 2 hours are as follows:

                            | THP=madvise |  THP=always   | THP=always
                            |             |               | + shrinker series
                            |             |               | + max_ptes_none=409
-----------------------------------------------------------------------------
Performance improvement     |      -      |    +1.8%      |     +1.7%
(over THP=madvise)          |             |               |
-----------------------------------------------------------------------------
Memory usage                |    54.6G    | 58.8G (+7.7%) |   55.9G (+2.4%)
-----------------------------------------------------------------------------
max_ptes_none=409 means that any THP that has more than 409 out of 512
(80%) zero filled filled pages will be split.

To test out the patches, the below commands without the shrinker will
invoke OOM killer immediately and kill stress, but will not fail with the
shrinker:

echo 450 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
mkdir /sys/fs/cgroup/test
echo $$ > /sys/fs/cgroup/test/cgroup.procs
echo 20M > /sys/fs/cgroup/test/memory.max
echo 0 > /sys/fs/cgroup/test/memory.swap.max
# allocate twice memory.max for each stress worker and touch 40/512 of
# each THP, i.e. vm-stride 50K.
# With the shrinker, max_ptes_none of 470 and below won't invoke OOM
# killer.
# Without the shrinker, OOM killer is invoked immediately irrespective
# of max_ptes_none value and kills stress.
stress --vm 1 --vm-bytes 40M --vm-stride 50K


This patch (of 5):

Here being unused means containing only zeros and inaccessible to
userspace.  When splitting an isolated thp under reclaim or migration, the
unused subpages can be mapped to the shared zeropage, hence saving memory.
This is particularly helpful when the internal fragmentation of a thp is
high, i.e.  it has many untouched subpages.

This is also a prerequisite for THP low utilization shrinker which will be
introduced in later patches, where underutilized THPs are split, and the
zero-filled pages are freed saving memory.

Link: https://lkml.kernel.org/r/20240830100438.3623486-1-usamaarif642@gmail.com
Link: https://lkml.kernel.org/r/20240830100438.3623486-3-usamaarif642@gmail.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Tested-by: Shuang Zhai <zhais@google.com>
Cc: Alexander Zhu <alexlzhu@fb.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuang Zhai <szhai2@cs.rochester.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 91b5935e8485..d5e93e44322e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+	RMP_LOCKED		= 1 << 0,
+	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
 
 /*
  * rmap_walk_control: To control rmap traversing for specific needs
-- 
cgit v1.2.3


From 8422acdc97ed5839692b45f800dbfb78abe65a94 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Fri, 30 Aug 2024 11:03:38 +0100
Subject: mm: introduce a pageflag for partially mapped folios

Currently folio->_deferred_list is used to keep track of partially_mapped
folios that are going to be split under memory pressure.  In the next
patch, all THPs that are faulted in and collapsed by khugepaged are also
going to be tracked using _deferred_list.

This patch introduces a pageflag to be able to distinguish between
partially mapped folios and others in the deferred_list at split time in
deferred_split_scan.  Its needed as __folio_remove_rmap decrements
_mapcount, _large_mapcount and _entire_mapcount, hence it won't be
possible to distinguish between partially mapped folios and others in
deferred_split_scan.

Eventhough it introduces an extra flag to track if the folio is partially
mapped, there is no functional change intended with this patch and the
flag is not useful in this patch itself, it will become useful in the next
patch when _deferred_list has non partially mapped folios.

Link: https://lkml.kernel.org/r/20240830100438.3623486-5-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Cc: Alexander Zhu <alexlzhu@fb.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuang Zhai <zhais@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Shuang Zhai <szhai2@cs.rochester.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    |  4 ++--
 include/linux/page-flags.h | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4902e2f7e896..e3896ebf0f94 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -333,7 +333,7 @@ static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list_to_order(page, NULL, 0);
 }
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct folio *folio);
@@ -507,7 +507,7 @@ static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
-static inline void deferred_split_folio(struct folio *folio) {}
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2175ebceb41c..1b3a76710487 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -186,6 +186,7 @@ enum pageflags {
 	/* At least one page in this folio has the hwpoison flag set */
 	PG_has_hwpoisoned = PG_active,
 	PG_large_rmappable = PG_workingset, /* anon or file-backed */
+	PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -859,8 +860,18 @@ static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+/*
+ * PG_partially_mapped is protected by deferred_split split_queue_lock,
+ * so its safe to use non-atomic set/clear.
+ */
+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
 #else
 FOLIO_FLAG_FALSE(large_rmappable)
+FOLIO_TEST_FLAG_FALSE(partially_mapped)
+__FOLIO_SET_FLAG_NOOP(partially_mapped)
+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
 #endif
 
 #define PG_head_mask ((1UL << PG_head))
@@ -1171,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
  */
 #define PAGE_FLAGS_SECOND						\
 	(0xffUL /* order */		| 1UL << PG_has_hwpoisoned |	\
-	 1UL << PG_large_rmappable)
+	 1UL << PG_large_rmappable	| 1UL << PG_partially_mapped)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
-- 
cgit v1.2.3


From dafff3f4c850c98e15501bc6ee20581f9a013dcc Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Fri, 30 Aug 2024 11:03:39 +0100
Subject: mm: split underused THPs

This is an attempt to mitigate the issue of running out of memory when THP
is always enabled.  During runtime whenever a THP is being faulted in
(__do_huge_pmd_anonymous_page) or collapsed by khugepaged
(collapse_huge_page), the THP is added to _deferred_list.  Whenever memory
reclaim happens in linux, the kernel runs the deferred_split shrinker
which goes through the _deferred_list.

If the folio was partially mapped, the shrinker attempts to split it.  If
the folio is not partially mapped, the shrinker checks if the THP was
underused, i.e.  how many of the base 4K pages of the entire THP were
zero-filled.  If this number goes above a certain threshold (decided by
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none), the
shrinker will attempt to split that THP.  Then at remap time, the pages
that were zero-filled are mapped to the shared zeropage, hence saving
memory.

Link: https://lkml.kernel.org/r/20240830100438.3623486-6-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Suggested-by: Rik van Riel <riel@surriel.com>
Co-authored-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexander Zhu <alexlzhu@fb.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Shuang Zhai <zhais@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Shuang Zhai <szhai2@cs.rochester.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/khugepaged.h    | 1 +
 include/linux/vm_event_item.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index f68865e19b0b..30baae91b225 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -4,6 +4,7 @@
 
 #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
 
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern struct attribute_group khugepaged_attr_group;
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aae5c7c5cfb4..aed952d04132 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_SPLIT_PAGE,
 		THP_SPLIT_PAGE_FAILED,
 		THP_DEFERRED_SPLIT_PAGE,
+		THP_UNDERUSED_SPLIT_PAGE,
 		THP_SPLIT_PMD,
 		THP_SCAN_EXCEED_NONE_PTE,
 		THP_SCAN_EXCEED_SWAP_PTE,
-- 
cgit v1.2.3


From 0a6fff20d36bc81156a49649f622b152330fed3c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 1 Sep 2024 16:24:59 -0400
Subject: mm: fix folio_alloc_noprof()

folio_alloc_noprof) wasn't calling the _noprof version, causing
allocations to be accounted here instead of to the caller

Link: https://lkml.kernel.org/r/20240901202459.4867-1-kent.overstreet@linux.dev
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 03ba9563c6db..a951de920e20 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -319,7 +319,7 @@ static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order
 }
 static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 {
-	return __folio_alloc_node(gfp, order, numa_node_id());
+	return __folio_alloc_node_noprof(gfp, order, numa_node_id());
 }
 static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid)
-- 
cgit v1.2.3


From 6050df6d706f58b2240bfabece9f406170104d57 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Fri, 9 Aug 2024 02:01:15 +0000
Subject: maple_tree: fix comment typo on ma_flag of allocation tree

The maple tree flag of allocation tree is MT_FLAGS_ALLOC_RANGE.

Just correct it.

Link: https://lkml.kernel.org/r/20240809020115.31575-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 8e1504a81cd2..c2c11004085e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -52,9 +52,9 @@
  * bit in the node type.  This is possible by using bit 1 to indicate if bit 2
  * is part of the type or the slot.
  *
- * Once the type is decided, the decision of an allocation range type or a range
- * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
- * flag.
+ * Once the type is decided, the decision of an allocation range type or a
+ * range type is done by examining the immutable tree flag for the
+ * MT_FLAGS_ALLOC_RANGE flag.
  *
  *  Node types:
  *   0x??1 = Root
-- 
cgit v1.2.3


From 4fc4187984e5dbd7ad63c3acf0b228fa9bf298d3 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <senozhatsky@chromium.org>
Date: Mon, 2 Sep 2024 19:55:49 +0900
Subject: lib: zstd: export API needed for dictionary support

Patch series "zram: introduce custom comp backends API", v7.

This series introduces support for run-time compression algorithms tuning,
so users, for instance, can adjust compression/acceleration levels and
provide pre-trained compression/decompression dictionaries which certain
algorithms support.

At this point we stop supporting (old/deprecated) comp API.  We may add
new acomp API support in the future, but before that zram needs to undergo
some major rework (we are not ready for async compression).

Some benchmarks for reference (look at column #2)

*** init zstd
/sys/block/zram0/mm_stat
1750659072 504622188 514355200        0 514355200        1        0    34204    34204

*** init zstd dict=/home/ss/zstd-dict-amd64
/sys/block/zram0/mm_stat
1750650880 465908890 475398144        0 475398144        1        0    34185    34185

*** init zstd level=8 dict=/home/ss/zstd-dict-amd64
/sys/block/zram0/mm_stat
1750654976 430803319 439873536        0 439873536        1        0    34185    34185

*** init lz4
/sys/block/zram0/mm_stat
1750646784 664266564 677060608        0 677060608        1        0    34288    34288

*** init lz4 dict=/home/ss/lz4-dict-amd64
/sys/block/zram0/mm_stat
1750650880 619990300 632102912        0 632102912        1        0    34278    34278

*** init lz4hc
/sys/block/zram0/mm_stat
1750630400 609023822 621232128        0 621232128        1        0    34288    34288

*** init lz4hc dict=/home/ss/lz4-dict-amd64
/sys/block/zram0/mm_stat
1750659072 505133172 515231744        0 515231744        1        0    34278    34278


Recompress
init zram zstd (prio=0), zstd level=5 (prio 1), zstd with dict (prio 2)

*** zstd
/sys/block/zram0/mm_stat
1750982656 504630584 514269184        0 514269184        1        0    34204    34204

*** idle recompress priority=1 (zstd level=5)
/sys/block/zram0/mm_stat
1750982656 488645601 525438976        0 514269184        1        0    34204    34204

*** idle recompress priority=2 (zstd dict)
/sys/block/zram0/mm_stat
1750982656 460869640 517914624        0 514269184        1        0    34185    34204


This patch (of 24):

We need to export a number of API functions that enable advanced zstd
usage - C/D dictionaries, dictionaries sharing between contexts, etc.

Link: https://lkml.kernel.org/r/20240902105656.1383858-1-senozhatsky@chromium.org
Link: https://lkml.kernel.org/r/20240902105656.1383858-2-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Nick Terrell <terrelln@fb.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zstd.h | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..b2c7cf310c8f 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -77,6 +77,30 @@ int zstd_min_clevel(void);
  */
 int zstd_max_clevel(void);
 
+/**
+ * zstd_default_clevel() - default compression level
+ *
+ * Return: Default compression level.
+ */
+int zstd_default_clevel(void);
+
+/**
+ * struct zstd_custom_mem - custom memory allocation
+ */
+typedef ZSTD_customMem zstd_custom_mem;
+
+/**
+ * struct zstd_dict_load_method - Dictionary load method.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_dictLoadMethod_e zstd_dict_load_method;
+
+/**
+ * struct zstd_dict_content_type - Dictionary context type.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_dictContentType_e zstd_dict_content_type;
+
 /* ======   Parameter Selection   ====== */
 
 /**
@@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters;
 zstd_parameters zstd_get_params(int level,
 	unsigned long long estimated_src_size);
 
+
+/**
+ * zstd_get_cparams() - returns zstd_compression_parameters for selected level
+ * @level:              The compression level
+ * @estimated_src_size: The estimated source size to compress or 0
+ *                      if unknown.
+ * @dict_size:          Dictionary size.
+ *
+ * Return:              The selected zstd_compression_parameters.
+ */
+zstd_compression_parameters zstd_get_cparams(int level,
+	unsigned long long estimated_src_size, size_t dict_size);
+
 /* ======   Single-pass Compression   ====== */
 
 typedef ZSTD_CCtx zstd_cctx;
@@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size);
 size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity,
 	const void *src, size_t src_size, const zstd_parameters *parameters);
 
+/**
+ * zstd_create_cctx_advanced() - Create compression context
+ * @custom_mem:   Custom allocator.
+ *
+ * Return:        NULL on error, pointer to compression context otherwise.
+ */
+zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_cctx() - Free compression context
+ * @cdict:        Pointer to compression context.
+ *
+ * Return:        Always 0.
+ */
+size_t zstd_free_cctx(zstd_cctx* cctx);
+
+/**
+ * struct zstd_cdict - Compression dictionary.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_CDict zstd_cdict;
+
+/**
+ * zstd_create_cdict_byreference() - Create compression dictionary
+ * @dict:              Pointer to dictionary buffer.
+ * @dict_size:         Size of the dictionary buffer.
+ * @dict_load_method:  Dictionary load method.
+ * @dict_content_type: Dictionary content type.
+ * @custom_mem:        Memory allocator.
+ *
+ * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be
+ * free before zstd_cdict is destroyed.
+ *
+ * Return:             NULL on error, pointer to compression dictionary
+ *                     otherwise.
+ */
+zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size,
+					  zstd_compression_parameters cparams,
+					  zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_cdict() - Free compression dictionary
+ * @cdict:        Pointer to compression dictionary.
+ *
+ * Return:        Always 0.
+ */
+size_t zstd_free_cdict(zstd_cdict* cdict);
+
+/**
+ * zstd_compress_using_cdict() - compress src into dst using a dictionary
+ * @cctx:         The context. Must have been initialized with zstd_init_cctx().
+ * @dst:          The buffer to compress src into.
+ * @dst_capacity: The size of the destination buffer. May be any size, but
+ *                ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:          The data to compress.
+ * @src_size:     The size of the data to compress.
+ * @cdict:        The dictionary to be used.
+ *
+ * Return:        The compressed size or an error, which can be checked using
+ *                zstd_is_error().
+ */
+size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst,
+	size_t dst_capacity, const void *src, size_t src_size,
+	const zstd_cdict *cdict);
+
 /* ======   Single-pass Decompression   ====== */
 
 typedef ZSTD_DCtx zstd_dctx;
@@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size);
 size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity,
 	const void *src, size_t src_size);
 
+/**
+ * struct zstd_ddict - Decompression dictionary.
+ * See zstd_lib.h.
+ */
+typedef ZSTD_DDict zstd_ddict;
+
+/**
+ * zstd_create_ddict_byreference() - Create decompression dictionary
+ * @dict:              Pointer to dictionary buffer.
+ * @dict_size:         Size of the dictionary buffer.
+ * @dict_load_method:  Dictionary load method.
+ * @dict_content_type: Dictionary content type.
+ * @custom_mem:        Memory allocator.
+ *
+ * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be
+ * free before zstd_ddict is destroyed.
+ *
+ * Return:             NULL on error, pointer to decompression dictionary
+ *                     otherwise.
+ */
+zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size,
+					  zstd_custom_mem custom_mem);
+/**
+ * zstd_free_ddict() - Free decompression dictionary
+ * @dict:         Pointer to the dictionary.
+ *
+ * Return:        Always 0.
+ */
+size_t zstd_free_ddict(zstd_ddict *ddict);
+
+/**
+ * zstd_create_dctx_advanced() - Create decompression context
+ * @custom_mem:   Custom allocator.
+ *
+ * Return:        NULL on error, pointer to decompression context otherwise.
+ */
+zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem);
+
+/**
+ * zstd_free_dctx() -- Free decompression context
+ * @dctx:         Pointer to decompression context.
+ * Return:        Always 0.
+ */
+size_t zstd_free_dctx(zstd_dctx *dctx);
+
+/**
+ * zstd_decompress_using_ddict() - decompress src into dst using a dictionary
+ * @dctx:         The decompression context.
+ * @dst:          The buffer to decompress src into.
+ * @dst_capacity: The size of the destination buffer. Must be at least as large
+ *                as the decompressed size. If the caller cannot upper bound the
+ *                decompressed size, then it's better to use the streaming API.
+ * @src:          The zstd compressed data to decompress. Multiple concatenated
+ *                frames and skippable frames are allowed.
+ * @src_size:     The exact size of the data to decompress.
+ * @ddict:        The dictionary to be used.
+ *
+ * Return:        The decompressed size or an error, which can be checked using
+ *                zstd_is_error().
+ */
+size_t zstd_decompress_using_ddict(zstd_dctx *dctx,
+	void *dst, size_t dst_capacity, const void *src, size_t src_size,
+	const zstd_ddict *ddict);
+
+
 /* ======   Streaming Buffers   ====== */
 
 /**
-- 
cgit v1.2.3


From e1e4cfd01a6e75dd4c810aeac115340805cf63ff Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Tue, 3 Sep 2024 11:19:28 -0400
Subject: mm,tmpfs: consider end of file write in shmem_is_huge

Take the end of a file write into consideration when deciding whether or
not to use huge pages for tmpfs files when the tmpfs filesystem is mounted
with huge=within_size

This allows large writes that append to the end of a file to automatically
use large pages.

Doing 4MB sequential writes without fallocate to a 16GB tmpfs file with
fio.  The numbers without THP or with huge=always stay the same, but the
performance with huge=within_size now matches that of huge=always.

huge		before		after
4kB pages	1560 MB/s	1560 MB/s
within_size	1560 MB/s	4720 MB/s
always:		4720 MB/s	4720 MB/s

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20240903111928.7171e60c@imladris.surriel.com
Signed-off-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1564d7d3ca61..515a9a6a3c6f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -113,11 +113,11 @@ int shmem_unuse(unsigned int type);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool shmem_huge_force);
+				loff_t write_end, bool shmem_huge_force);
 #else
 static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
 				struct vm_area_struct *vma, pgoff_t index,
-				bool shmem_huge_force)
+				loff_t write_end, bool shmem_huge_force)
 {
 	return 0;
 }
@@ -143,8 +143,8 @@ enum sgp_type {
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
 
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-		enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+		struct folio **foliop, enum sgp_type sgp);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
 		pgoff_t index, gfp_t gfp);
 
-- 
cgit v1.2.3


From 25d4054cc97484f2555709ac233f955f674e026a Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 4 Sep 2024 17:57:59 +0100
Subject: mm: make arch_get_unmapped_area() take vm_flags by default

Patch series "mm: Care about shadow stack guard gap when getting an
unmapped area", v2.

As covered in the commit log for c44357c2e76b ("x86/mm: care about shadow
stack guard gap during placement") our current mmap() implementation does
not take care to ensure that a new mapping isn't placed with existing
mappings inside it's own guard gaps.  This is particularly important for
shadow stacks since if two shadow stacks end up getting placed adjacent to
each other then they can overflow into each other which weakens the
protection offered by the feature.

On x86 there is a custom arch_get_unmapped_area() which was updated by the
above commit to cover this case by specifying a start_gap for allocations
with VM_SHADOW_STACK.  Both arm64 and RISC-V have equivalent features and
use the generic implementation of arch_get_unmapped_area() so let's make
the equivalent change there so they also don't get shadow stack pages
placed without guard pages.  The arm64 and RISC-V shadow stack
implementations are currently on the list:

   https://lore.kernel.org/r/20240829-arm64-gcs-v12-0-42fec94743
   https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/

Given the addition of the use of vm_flags in the generic implementation we
also simplify the set of possibilities that have to be dealt with in the
core code by making arch_get_unmapped_area() take vm_flags as standard.
This is a bit invasive since the prototype change touches quite a few
architectures but since the parameter is ignored the change is
straightforward, the simplification for the generic code seems worth it.


This patch (of 3):

When we introduced arch_get_unmapped_area_vmflags() in 961148704acd ("mm:
introduce arch_get_unmapped_area_vmflags()") we did so as part of properly
supporting guard pages for shadow stacks on x86_64, which uses a custom
arch_get_unmapped_area().  Equivalent features are also present on both
arm64 and RISC-V, both of which use the generic implementation of
arch_get_unmapped_area() and will require equivalent modification there.
Rather than continue to deal with having two versions of the functions
let's bite the bullet and have all implementations of
arch_get_unmapped_area() take vm_flags as a parameter.

The new parameter is currently ignored by all implementations other than
x86.  The only caller that doesn't have a vm_flags available is
mm_get_unmapped_area(), as for the x86 implementation and the wrapper used
on other architectures this is modified to supply no flags.

No functional changes.

Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-0-a46b8b6dc0ed@kernel.org
Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-1-a46b8b6dc0ed@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Acked-by: Helge Deller <deller@gmx.de>	[parisc]
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Chris Zankel <chris@zankel.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: "Edgecombe, Rick P" <rick.p.edgecombe@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/mm.h | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 91546493c43d..c4d34abc45d4 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -179,27 +179,20 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 
 extern void arch_pick_mmap_layout(struct mm_struct *mm,
 				  struct rlimit *rlim_stack);
-extern unsigned long
-arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
-		       unsigned long, unsigned long);
-extern unsigned long
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		       unsigned long len, unsigned long pgoff,
+		       unsigned long flags, vm_flags_t vm_flags);
+unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
-			  unsigned long len, unsigned long pgoff,
-			  unsigned long flags);
+			       unsigned long len, unsigned long pgoff,
+			       unsigned long flags, vm_flags_t);
 
 unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
 				   unsigned long addr, unsigned long len,
 				   unsigned long pgoff, unsigned long flags);
 
-unsigned long
-arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
-			       unsigned long len, unsigned long pgoff,
-			       unsigned long flags, vm_flags_t vm_flags);
-unsigned long
-arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
-				       unsigned long len, unsigned long pgoff,
-				       unsigned long flags, vm_flags_t);
-
 unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
 					   struct file *filp,
 					   unsigned long addr,
-- 
cgit v1.2.3


From 540e00a729dfbefe0aa0d64b6dac1d1b620a1787 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 4 Sep 2024 17:58:00 +0100
Subject: mm: pass vm_flags to generic_get_unmapped_area()

In preparation for using vm_flags to ensure guard pages for shadow stacks
supply them as an argument to generic_get_unmapped_area().  The only user
outside of the core code is the PowerPC book3s64 implementation which is
trivially wrapping the generic implementation in the radix_enabled() case.

No functional changes.

Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-2-a46b8b6dc0ed@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Chris Zankel <chris@zankel.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: "Edgecombe, Rick P" <rick.p.edgecombe@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index c4d34abc45d4..07bb8d4181d7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -204,11 +204,11 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
 unsigned long
 generic_get_unmapped_area(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
-			  unsigned long flags);
+			  unsigned long flags, vm_flags_t vm_flags);
 unsigned long
 generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 				  unsigned long len, unsigned long pgoff,
-				  unsigned long flags);
+				  unsigned long flags, vm_flags_t vm_flags);
 #else
 static inline void arch_pick_mmap_layout(struct mm_struct *mm,
 					 struct rlimit *rlim_stack) {}
-- 
cgit v1.2.3


From 08e28de1160a712724268fd33d77b32f1bc84d1c Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 3 Sep 2024 09:36:28 +0200
Subject: uprobes: use vm_special_mapping close() functionality

The following KASAN splat was shown:

[   44.505448] ==================================================================                                                                      20:37:27 [3421/145075]
[   44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8
[   44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384
[   44.505479]
[   44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496
[   44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
[   44.505508] Call Trace:
[   44.505511]  [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108
[   44.505521]  [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0
[   44.505529]  [<000b0324d2f5464c>] print_report+0x44/0x138
[   44.505536]  [<000b0324d1383192>] kasan_report+0xc2/0x140
[   44.505543]  [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8
[   44.505550]  [<000b0324d12c7978>] remove_vma+0x78/0x120
[   44.505557]  [<000b0324d128a2c6>] exit_mmap+0x326/0x750
[   44.505563]  [<000b0324d0ba655a>] __mmput+0x9a/0x370
[   44.505570]  [<000b0324d0bbfbe0>] exit_mm+0x240/0x340
[   44.505575]  [<000b0324d0bc0228>] do_exit+0x548/0xd70
[   44.505580]  [<000b0324d0bc1102>] do_group_exit+0x132/0x390
[   44.505586]  [<000b0324d0bc13b6>] __s390x_sys_exit_group+0x56/0x60
[   44.505592]  [<000b0324d0adcbd6>] do_syscall+0x2f6/0x430
[   44.505599]  [<000b0324d2f78434>] __do_syscall+0xa4/0x170
[   44.505606]  [<000b0324d2f9454c>] system_call+0x74/0x98
[   44.505614]
[   44.505616] Allocated by task 1384:
[   44.505621]  kasan_save_stack+0x40/0x70
[   44.505630]  kasan_save_track+0x28/0x40
[   44.505636]  __kasan_kmalloc+0xa0/0xc0
[   44.505642]  __create_xol_area+0xfa/0x410
[   44.505648]  get_xol_area+0xb0/0xf0
[   44.505652]  uprobe_notify_resume+0x27a/0x470
[   44.505657]  irqentry_exit_to_user_mode+0x15e/0x1d0
[   44.505664]  pgm_check_handler+0x122/0x170
[   44.505670]
[   44.505672] Freed by task 1384:
[   44.505676]  kasan_save_stack+0x40/0x70
[   44.505682]  kasan_save_track+0x28/0x40
[   44.505687]  kasan_save_free_info+0x4a/0x70
[   44.505693]  __kasan_slab_free+0x5a/0x70
[   44.505698]  kfree+0xe8/0x3f0
[   44.505704]  __mmput+0x20/0x370
[   44.505709]  exit_mm+0x240/0x340
[   44.505713]  do_exit+0x548/0xd70
[   44.505718]  do_group_exit+0x132/0x390
[   44.505722]  __s390x_sys_exit_group+0x56/0x60
[   44.505727]  do_syscall+0x2f6/0x430
[   44.505732]  __do_syscall+0xa4/0x170
[   44.505738]  system_call+0x74/0x98

The problem is that uprobe_clear_state() kfree's struct xol_area, which
contains struct vm_special_mapping *xol_mapping. This one is passed to
_install_special_mapping() in xol_add_vma().
__mput reads:

static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        ...
}

So uprobe_clear_state() in the beginning free's the memory area
containing the vm_special_mapping data, but exit_mmap() uses this
address later via vma->vm_private_data (which was set in
_install_special_mapping().

Fix this by moving uprobe_clear_state() to uprobes.c and use it as
close() callback.

[usama.anjum@collabora.com: remove unneeded condition]
  Link: https://lkml.kernel.org/r/20240906101825.177490-1-usama.anjum@collabora.com
Link: https://lkml.kernel.org/r/20240903073629.2442754-1-svens@linux.ibm.com
Fixes: 223febc6e557 ("mm: add optional close() to struct vm_special_mapping")
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/uprobes.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b503fafb7fb3..493dc95d912c 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -126,7 +126,6 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
 extern void uprobe_notify_resume(struct pt_regs *regs);
 extern bool uprobe_deny_signal(void);
 extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
-extern void uprobe_clear_state(struct mm_struct *mm);
 extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
 extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-- 
cgit v1.2.3


From 0e40cf2a8b2c847950e025d5aa594bd545118d26 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Thu, 5 Sep 2024 00:30:50 +0000
Subject: cgroup: clarify css sibling linkage is protected by cgroup_mutex or
 RCU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Improve mem_cgroup_iter()", v4.

Incremental cgroup iteration is being used again [1]. This patchset
improves the reliability of mem_cgroup_iter(). It also improves
simplicity and code readability.

[1] https://lore.kernel.org/20240514202641.2821494-1-hannes@cmpxchg.org/


This patch (of 5):

Explicitly document that css sibling/descendant linkage is protected by
cgroup_mutex or RCU.  Also, document in css_next_descendant_pre() and
similar functions that it isn't necessary to hold a ref on @pos.

The following changes in this patchset rely on this clarification for
simplification in memcg iteration code.

Link: https://lkml.kernel.org/r/20240905003058.1859929-1-kinseyho@google.com
Link: https://lkml.kernel.org/r/20240905003058.1859929-2-kinseyho@google.com
Suggested-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup-defs.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 7fc2d0195f56..ca7e912b8355 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -172,7 +172,11 @@ struct cgroup_subsys_state {
 	/* reference count - access via css_[try]get() and css_put() */
 	struct percpu_ref refcnt;
 
-	/* siblings list anchored at the parent's ->children */
+	/*
+	 * siblings list anchored at the parent's ->children
+	 *
+	 * linkage is protected by cgroup_mutex or RCU
+	 */
 	struct list_head sibling;
 	struct list_head children;
 
-- 
cgit v1.2.3


From ec0db74b4b1f249ffca4df450f54c17573114045 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Thu, 5 Sep 2024 00:30:53 +0000
Subject: mm: restart if multiple traversals raced
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, if multiple reclaimers raced on the same position, the
reclaimers which detect the race will still reclaim from the same memcg.
Instead, the reclaimers which detect the race should move on to the next
memcg in the hierarchy.

So, in the case where multiple traversals race, jump back to the start of
the mem_cgroup_iter() function to find the next memcg in the hierarchy to
reclaim from.

Link: https://lkml.kernel.org/r/20240905003058.1859929-5-kinseyho@google.com
Reported-by: syzbot+e099d407346c45275ce9@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/000000000000817cf10620e20d33@google.com/
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fe05fdb92779..2ef94c74847d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -57,7 +57,7 @@ enum memcg_memory_event {
 
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
-	unsigned int generation;
+	int generation;
 };
 
 #ifdef CONFIG_MEMCG
@@ -78,7 +78,7 @@ struct lruvec_stats;
 struct mem_cgroup_reclaim_iter {
 	struct mem_cgroup *position;
 	/* scan generation, increased every round-trip */
-	unsigned int generation;
+	atomic_t generation;
 };
 
 /*
-- 
cgit v1.2.3


From eebc0f48468e68b93393d11f75ad17385a9acca8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 5 Sep 2024 22:21:06 -0600
Subject: mm/codetag: fix a typo

Link: https://lkml.kernel.org/r/20240906042108.1150526-1-yuzhao@google.com
Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Muchun Song <muchun.song@linux.dev>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8c61ccd161ba..896491d9ebe8 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -70,7 +70,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 /*
  * When percpu variables are required to be defined as weak, static percpu
  * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
- * Instead we will accound all module allocations to a single counter.
+ * Instead we will account all module allocations to a single counter.
  */
 DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 
-- 
cgit v1.2.3


From 95599ef684d01136a8b77c16a7c853496786e173 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 5 Sep 2024 22:21:07 -0600
Subject: mm/codetag: fix pgalloc_tag_split()

The current assumption is that a large folio can only be split into
order-0 folios.  That is not the case for hugeTLB demotion, nor for THP
split: see commit c010d47f107f ("mm: thp: split huge page to any lower
order pages").

When a large folio is split into ones of a lower non-zero order, only the
new head pages should be tagged.  Tagging tail pages can cause imbalanced
"calls" counters, since only head pages are untagged by pgalloc_tag_sub()
and the "calls" counts on tail pages are leaked, e.g.,

  # echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
  # echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
  # time echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote
  # echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
  # grep alloc_gigantic_folio /proc/allocinfo

Before this patch:
  0  549427200  mm/hugetlb.c:1549 func:alloc_gigantic_folio

  real  0m2.057s
  user  0m0.000s
  sys   0m2.051s

After this patch:
  0          0  mm/hugetlb.c:1549 func:alloc_gigantic_folio

  real  0m1.711s
  user  0m0.000s
  sys   0m1.704s

Not tagging tail pages also improves the splitting time, e.g., by about
15% when demoting 1GB hugeTLB folios to 2MB ones, as shown above.

Link: https://lkml.kernel.org/r/20240906042108.1150526-2-yuzhao@google.com
Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h          | 30 ++++++++++++++++++++++++++++++
 include/linux/pgalloc_tag.h | 31 -------------------------------
 2 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b0ff06d18c71..6bb778cbaabf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4084,4 +4084,34 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma);
 
 int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+	int i;
+	struct alloc_tag *tag;
+	unsigned int nr_pages = 1 << new_order;
+
+	if (!mem_alloc_profiling_enabled())
+		return;
+
+	tag = pgalloc_tag_get(&folio->page);
+	if (!tag)
+		return;
+
+	for (i = nr_pages; i < (1 << old_order); i += nr_pages) {
+		union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i));
+
+		if (ref) {
+			/* Set new reference to point to the original tag */
+			alloc_tag_ref_set(ref, tag);
+			put_page_tag_ref(ref);
+		}
+	}
+}
+#else /* !CONFIG_MEM_ALLOC_PROFILING */
+static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
+{
+}
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 207f0c83c8e9..59a3deb792a8 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
 	}
 }
 
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr)
-{
-	int i;
-	struct page_ext *first_page_ext;
-	struct page_ext *page_ext;
-	union codetag_ref *ref;
-	struct alloc_tag *tag;
-
-	if (!mem_alloc_profiling_enabled())
-		return;
-
-	first_page_ext = page_ext = page_ext_get(page);
-	if (unlikely(!page_ext))
-		return;
-
-	ref = codetag_ref_from_page_ext(page_ext);
-	if (!ref->ct)
-		goto out;
-
-	tag = ct_to_alloc_tag(ref->ct);
-	page_ext = page_ext_next(page_ext);
-	for (i = 1; i < nr; i++) {
-		/* Set new reference to point to the original tag */
-		alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag);
-		page_ext = page_ext_next(page_ext);
-	}
-out:
-	page_ext_put(first_page_ext);
-}
-
 static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
 {
 	struct alloc_tag *tag = NULL;
@@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {}
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
 static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 
-- 
cgit v1.2.3


From e0a955bf7f61cb034d228736d81c1ab3a47a3dca Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 5 Sep 2024 22:21:08 -0600
Subject: mm/codetag: add pgalloc_tag_copy()

Add pgalloc_tag_copy() to transfer the codetag from the old folio to the
new one during migration.  This makes original allocation sites persist
cross migration rather than lump into the get_new_folio callbacks passed
into migrate_pages(), e.g., compaction_alloc():

  # echo 1 >/proc/sys/vm/compact_memory
  # grep compaction_alloc /proc/allocinfo

Before this patch:
  132968448  32463  mm/compaction.c:1880 func:compaction_alloc

After this patch:
          0      0  mm/compaction.c:1880 func:compaction_alloc

Link: https://lkml.kernel.org/r/20240906042108.1150526-3-yuzhao@google.com
Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h | 24 ++++++++++--------------
 include/linux/mm.h        | 27 +++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 896491d9ebe8..1f0a9ff23a2c 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
 /* Caller should verify both ref and tag to be valid */
 static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
 {
+	alloc_tag_add_check(ref, tag);
+	if (!ref || !tag)
+		return;
+
 	ref->ct = &tag->ct;
+}
+
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+	__alloc_tag_ref_set(ref, tag);
 	/*
 	 * We need in increment the call counter every time we have a new
 	 * allocation or when we split a large allocation into smaller ones.
@@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
 	this_cpu_inc(tag->counters->calls);
 }
 
-static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
-{
-	alloc_tag_add_check(ref, tag);
-	if (!ref || !tag)
-		return;
-
-	__alloc_tag_ref_set(ref, tag);
-}
-
 static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
 {
-	alloc_tag_add_check(ref, tag);
-	if (!ref || !tag)
-		return;
-
-	__alloc_tag_ref_set(ref, tag);
+	alloc_tag_ref_set(ref, tag);
 	this_cpu_add(tag->counters->bytes, bytes);
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6bb778cbaabf..39f6faace590 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4108,10 +4108,37 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new
 		}
 	}
 }
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+	struct alloc_tag *tag;
+	union codetag_ref *ref;
+
+	tag = pgalloc_tag_get(&old->page);
+	if (!tag)
+		return;
+
+	ref = get_page_tag_ref(&new->page);
+	if (!ref)
+		return;
+
+	/* Clear the old ref to the original allocation tag. */
+	clear_page_tag_ref(&old->page);
+	/* Decrement the counters of the tag on get_new_folio. */
+	alloc_tag_sub(ref, folio_nr_pages(new));
+
+	__alloc_tag_ref_set(ref, tag);
+
+	put_page_tag_ref(ref);
+}
 #else /* !CONFIG_MEM_ALLOC_PROFILING */
 static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
 {
 }
+
+static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
+{
+}
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
 #endif /* _LINUX_MM_H */
-- 
cgit v1.2.3


From 6462dd53a26088a433f90a5c15822196f201037c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Sep 2024 13:42:47 -1000
Subject: sched_ext: Compact struct bpf_iter_scx_dsq_kern

struct scx_iter_scx_dsq is defined as 6 u64's and scx_dsq_iter_kern was
using 5 of them. We want to add two more u64 fields but it's better if we do
so while staying within scx_iter_scx_dsq to maintain binary compatibility.

The way scx_iter_scx_dsq_kern is laid out is rather inefficient - the node
field takes up three u64's but only one bit of the last u64 is used. Turn
the bool into u32 flags and only use the lower 16 bits freeing up 48 bits -
16 bits for flags, 32 bits for a u32 - for use by struct
bpf_iter_scx_dsq_kern.

This allows moving the dsq_seq and flags fields of bpf_iter_scx_dsq_kern
into the cursor field reducing the struct size by a full u64.

No behavior changes intended.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index dc646cca4fcf..1ddbde64a31b 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -119,9 +119,17 @@ enum scx_kf_mask {
 	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
 };
 
+enum scx_dsq_lnode_flags {
+	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
+
+	/* high 16 bits can be for iter cursor flags */
+	__SCX_DSQ_LNODE_PRIV_SHIFT = 16,
+};
+
 struct scx_dsq_list_node {
 	struct list_head	node;
-	bool			is_bpf_iter_cursor;
+	u32			flags;
+	u32			priv;		/* can be used by iter cursor */
 };
 
 /*
-- 
cgit v1.2.3


From 17245a195df4c86b1fd38718d8cdc532c040c08e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 6 Sep 2024 09:10:59 -0700
Subject: net: remove dev_pick_tx_cpu_id()

dev_pick_tx_cpu_id() has been introduced with two users by
commit a4ea8a3dacc3 ("net: Add generic ndo_select_queue functions").
The use in AF_PACKET has been removed in 2019 by
commit b71b5837f871 ("packet: rework packet_pick_tx_queue() to use common code selection")
The other user was a Netlogic XLP driver, removed in 2021 by
commit 47ac6f567c28 ("staging: Remove Netlogic XLP network driver").

It's relatively unlikely that any modern driver will need an
.ndo_select_queue implementation which picks purely based on CPU ID
and skips XPS, delete dev_pick_tx_cpu_id()

Found by code inspection.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20240906161059.715546-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b47c00657bd0..2e40a137dc12 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3085,8 +3085,6 @@ void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 		     struct net_device *sb_dev);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       struct net_device *sb_dev);
 
 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
 int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
-- 
cgit v1.2.3


From 2f87e9cf0c9e21ab9be1fb2ba8520a1525359497 Mon Sep 17 00:00:00 2001
From: Cindy Lu <lulu@redhat.com>
Date: Wed, 31 Jul 2024 11:16:01 +0800
Subject: vdpa: support set mac address from vdpa tool

Add new UAPI to support the mac address from vdpa tool
Function vdpa_nl_cmd_dev_attr_set_doit() will get the
new MAC address from the vdpa tool and then set it to the device.

The usage is: vdpa dev set name vdpa_name mac **:**:**:**:**:**

Here is example:
root@L1# vdpa -jp dev config show vdpa0
{
    "config": {
        "vdpa0": {
            "mac": "82:4d:e9:5d:d7:e6",
            "link ": "up",
            "link_announce ": false,
            "mtu": 1500
        }
    }
}

root@L1# vdpa dev set name vdpa0 mac 00:11:22:33:44:55

root@L1# vdpa -jp dev config show vdpa0
{
    "config": {
        "vdpa0": {
            "mac": "00:11:22:33:44:55",
            "link ": "up",
            "link_announce ": false,
            "mtu": 1500
        }
    }
}

Signed-off-by: Cindy Lu <lulu@redhat.com>
Message-Id: <20240731031653.1047692-2-lulu@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/vdpa.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 7977ca03ac7a..2e7a30fe6b92 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -582,11 +582,20 @@ void vdpa_set_status(struct vdpa_device *vdev, u8 status);
  *	     @dev: vdpa device to remove
  *	     Driver need to remove the specified device by calling
  *	     _vdpa_unregister_device().
+ * @dev_set_attr: change a vdpa device's attr after it was create
+ *	     @mdev: parent device to use for device
+ *	     @dev: vdpa device structure
+ *	     @config:Attributes to be set for the device.
+ *	     The driver needs to check the mask of the structure and then set
+ *	     the related information to the vdpa device. The driver must return 0
+ *	     if set successfully.
  */
 struct vdpa_mgmtdev_ops {
 	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name,
 		       const struct vdpa_dev_set_config *config);
 	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
+	int (*dev_set_attr)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev,
+			    const struct vdpa_dev_set_config *config);
 };
 
 /**
-- 
cgit v1.2.3


From 11596dc3dfae572cc267dda2dc4dab9ae34668f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Sep 2024 07:39:05 +0300
Subject: iomap: pass flags to iomap_file_buffered_write_punch_delalloc

To fix short write error handling, We'll need to figure out what operation
iomap_file_buffered_write_punch_delalloc is called for.  Pass the flags
argument on to it, and reorder the argument list to match that of
->iomap_end so that the compiler only has to add the new punch argument
to the end of it instead of reshuffling the registers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240910043949.3481298-4-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iomap.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f792b37f7627..52626906ff35 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -258,10 +258,6 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops, void *private);
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
-		struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
-		int (*punch)(struct inode *inode, loff_t pos, loff_t length));
-
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
@@ -277,6 +273,12 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 		const struct iomap_ops *ops);
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
 			const struct iomap_ops *ops);
+
+typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
+		loff_t length, ssize_t written, unsigned flag,
+		struct iomap *iomap, iomap_punch_t punch);
+
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len, const struct iomap_ops *ops);
 loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
-- 
cgit v1.2.3


From 492f53758fad4fde3e9a98696780f8b53f87cdae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Sep 2024 07:39:06 +0300
Subject: iomap: pass the iomap to the punch callback

XFS will need to look at the flags in the iomap structure, so pass it
down all the way to the callback.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240910043949.3481298-5-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iomap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 52626906ff35..ec2eab94f00a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -274,7 +274,8 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
 			const struct iomap_ops *ops);
 
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
+typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
+		struct iomap *iomap);
 int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
 		loff_t length, ssize_t written, unsigned flag,
 		struct iomap *iomap, iomap_punch_t punch);
-- 
cgit v1.2.3


From 4bceb9ba05aca23652567fb5f899cc7fcb12c8d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Sep 2024 07:39:07 +0300
Subject: iomap: remove the iomap_file_buffered_write_punch_delalloc return
 value

iomap_file_buffered_write_punch_delalloc can only return errors if either
the ->punch callback returned an error, or if someone changed the API of
mapping_seek_hole_data to return a negative error code that is not
-ENXIO.

As the only instance of ->punch never returns an error, an such an error
would be fatal anyway remove the entire error propagation and don't
return an error code from iomap_file_buffered_write_punch_delalloc.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240910043949.3481298-6-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iomap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index ec2eab94f00a..4ad12a3c8bae 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -274,9 +274,9 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
 			const struct iomap_ops *ops);
 
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
+typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
 		struct iomap *iomap);
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos,
 		loff_t length, ssize_t written, unsigned flag,
 		struct iomap *iomap, iomap_punch_t punch);
 
-- 
cgit v1.2.3


From 9028cdeb38e1f37d63cb3154799dd259b67e879e Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 5 Sep 2024 10:34:22 -0700
Subject: memcg: add charging of already allocated slab objects

At the moment, the slab objects are charged to the memcg at the
allocation time. However there are cases where slab objects are
allocated at the time where the right target memcg to charge it to is
not known. One such case is the network sockets for the incoming
connection which are allocated in the softirq context.

Couple hundred thousand connections are very normal on large loaded
server and almost all of those sockets underlying those connections get
allocated in the softirq context and thus not charged to any memcg.
However later at the accept() time we know the right target memcg to
charge. Let's add new API to charge already allocated objects, so we can
have better accounting of the memory usage.

To measure the performance impact of this change, tcp_crr is used from
the neper [1] performance suite. Basically it is a network ping pong
test with new connection for each ping pong.

The server and the client are run inside 3 level of cgroup hierarchy
using the following commands:

Server:
 $ tcp_crr -6

Client:
 $ tcp_crr -6 -c -H ${server_ip}

If the client and server run on different machines with 50 GBPS NIC,
there is no visible impact of the change.

For the same machine experiment with v6.11-rc5 as base.

          base (throughput)     with-patch
tcp_crr   14545 (+- 80)         14463 (+- 56)

It seems like the performance impact is within the noise.

Link: https://github.com/google/neper [1]
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com> # net
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index eb2bf4629157..3be2a5ed4936 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -547,6 +547,35 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 			    gfp_t gfpflags) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_lru(...)	alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))
 
+/**
+ * kmem_cache_charge - memcg charge an already allocated slab memory
+ * @objp: address of the slab object to memcg charge
+ * @gfpflags: describe the allocation context
+ *
+ * kmem_cache_charge allows charging a slab object to the current memcg,
+ * primarily in cases where charging at allocation time might not be possible
+ * because the target memcg is not known (i.e. softirq context)
+ *
+ * The objp should be pointer returned by the slab allocator functions like
+ * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge
+ * behavior can be controlled through gfpflags parameter, which affects how the
+ * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes
+ * that overcharging is requested instead of failure, but is not applied for the
+ * internal metadata allocation.
+ *
+ * There are several cases where it will return true even if the charging was
+ * not done:
+ * More specifically:
+ *
+ * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems.
+ * 2. Already charged slab objects.
+ * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc()
+ *    without __GFP_ACCOUNT
+ * 4. Allocating internal metadata has failed
+ *
+ * Return: true if charge was successful otherwise false.
+ */
+bool kmem_cache_charge(void *objp, gfp_t gfpflags);
 void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
-- 
cgit v1.2.3


From 879fb3c274c12cb0892718e1e54f85fc406e3c7b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:45 +0200
Subject: slab: add struct kmem_cache_args

Currently we have multiple kmem_cache_create*() variants that take up to
seven separate parameters with one of the functions having to grow an
eigth parameter in the future to handle both usercopy and a custom
freelist pointer.

Add a struct kmem_cache_args structure and move less common parameters
into it. Core parameters such as name, object size, and flags continue
to be passed separately.

Add a new function __kmem_cache_create_args() that takes a struct
kmem_cache_args pointer and port do_kmem_cache_create_usercopy() over to
it.

In follow-up patches we will port the other kmem_cache_create*()
variants over to it as well.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 5b2da2cf31a8..2b8eeca7fd2c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -240,6 +240,28 @@ struct mem_cgroup;
  */
 bool slab_is_available(void);
 
+/**
+ * struct kmem_cache_args - Less common arguments for kmem_cache_create()
+ * @align: The required alignment for the objects.
+ * @useroffset: Usercopy region offset
+ * @usersize: Usercopy region size
+ * @freeptr_offset: Custom offset for the free pointer in RCU caches
+ * @use_freeptr_offset: Whether a @freeptr_offset is used
+ * @ctor: A constructor for the objects.
+ */
+struct kmem_cache_args {
+	unsigned int align;
+	unsigned int useroffset;
+	unsigned int usersize;
+	unsigned int freeptr_offset;
+	bool use_freeptr_offset;
+	void (*ctor)(void *);
+};
+
+struct kmem_cache *__kmem_cache_create_args(const char *name,
+					    unsigned int object_size,
+					    struct kmem_cache_args *args,
+					    slab_flags_t flags);
 struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
 			unsigned int align, slab_flags_t flags,
 			void (*ctor)(void *));
-- 
cgit v1.2.3


From 052d67b46bcd91b6785e8e6e047241814f6142a3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:53 +0200
Subject: slab: port KMEM_CACHE() to struct kmem_cache_args

Make KMEM_CACHE() use struct kmem_cache_args.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2b8eeca7fd2c..1f38d94387cc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -284,9 +284,11 @@ int kmem_cache_shrink(struct kmem_cache *s);
  * f.e. add ____cacheline_aligned_in_smp to the struct declaration
  * then the objects will be properly aligned in SMP configurations.
  */
-#define KMEM_CACHE(__struct, __flags)					\
-		kmem_cache_create(#__struct, sizeof(struct __struct),	\
-			__alignof__(struct __struct), (__flags), NULL)
+#define KMEM_CACHE(__struct, __flags)                                   \
+	__kmem_cache_create_args(#__struct, sizeof(struct __struct),    \
+			&(struct kmem_cache_args) {			\
+				.align	= __alignof__(struct __struct), \
+			}, (__flags))
 
 /*
  * To whitelist a single field for copying to/from usercopy, use this
-- 
cgit v1.2.3


From 199cd13a745eb44fb4828bca373155293cdcfa5c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:54 +0200
Subject: slab: port KMEM_CACHE_USERCOPY() to struct kmem_cache_args

Make KMEM_CACHE_USERCOPY() use struct kmem_cache_args.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1f38d94387cc..7e15a3a3edb1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -294,12 +294,13 @@ int kmem_cache_shrink(struct kmem_cache *s);
  * To whitelist a single field for copying to/from usercopy, use this
  * macro instead for KMEM_CACHE() above.
  */
-#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)			\
-		kmem_cache_create_usercopy(#__struct,			\
-			sizeof(struct __struct),			\
-			__alignof__(struct __struct), (__flags),	\
-			offsetof(struct __struct, __field),		\
-			sizeof_field(struct __struct, __field), NULL)
+#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)						\
+	__kmem_cache_create_args(#__struct, sizeof(struct __struct),				\
+			&(struct kmem_cache_args) {						\
+				.align		= __alignof__(struct __struct),			\
+				.useroffset	= offsetof(struct __struct, __field),		\
+				.usersize	= sizeof_field(struct __struct, __field),	\
+			}, (__flags))
 
 /*
  * Common kmalloc functions provided by all allocators
-- 
cgit v1.2.3


From b2e7456b5c25c41eda7a8a15f7ccaa4e7579949f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:55 +0200
Subject: slab: create kmem_cache_create() compatibility layer

Use _Generic() to create a compatibility layer that type switches on the
third argument to either call __kmem_cache_create() or
__kmem_cache_create_args(). If NULL is passed for the struct
kmem_cache_args argument use default args making porting for callers
that don't care about additional arguments easy.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7e15a3a3edb1..15eb0a0defd6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -262,9 +262,10 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
 					    unsigned int object_size,
 					    struct kmem_cache_args *args,
 					    slab_flags_t flags);
-struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
-			unsigned int align, slab_flags_t flags,
-			void (*ctor)(void *));
+
+struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size,
+				       unsigned int align, slab_flags_t flags,
+				       void (*ctor)(void *));
 struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 			unsigned int size, unsigned int align,
 			slab_flags_t flags,
@@ -273,6 +274,28 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
 					 unsigned int freeptr_offset,
 					 slab_flags_t flags);
+
+/* If NULL is passed for @args, use this variant with default arguments. */
+static inline struct kmem_cache *
+__kmem_cache_default_args(const char *name, unsigned int size,
+			  struct kmem_cache_args *args,
+			  slab_flags_t flags)
+{
+	struct kmem_cache_args kmem_default_args = {};
+
+	/* Make sure we don't get passed garbage. */
+	if (WARN_ON_ONCE(args))
+		return ERR_PTR(-EINVAL);
+
+	return __kmem_cache_create_args(name, size, &kmem_default_args, flags);
+}
+
+#define kmem_cache_create(__name, __object_size, __args, ...)           \
+	_Generic((__args),                                              \
+		struct kmem_cache_args *: __kmem_cache_create_args,	\
+		void *: __kmem_cache_default_args,			\
+		default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)
+
 void kmem_cache_destroy(struct kmem_cache *s);
 int kmem_cache_shrink(struct kmem_cache *s);
 
-- 
cgit v1.2.3


From 3d453e60f1a9c377d670d5fe306d3b9eed477bc0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:57 +0200
Subject: slab: remove kmem_cache_create_rcu()

Now that we have ported all users of kmem_cache_create_rcu() to struct
kmem_cache_args the function is unused and can be removed.

Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 15eb0a0defd6..cb82a6414a28 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -271,9 +271,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
 			slab_flags_t flags,
 			unsigned int useroffset, unsigned int usersize,
 			void (*ctor)(void *));
-struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size,
-					 unsigned int freeptr_offset,
-					 slab_flags_t flags);
 
 /* If NULL is passed for @args, use this variant with default arguments. */
 static inline struct kmem_cache *
-- 
cgit v1.2.3


From 0c9050b09cfb1ddb3fe4b2d401dca1fb674c825a Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:58 +0200
Subject: slab: make kmem_cache_create_usercopy() static inline

Make kmem_cache_create_usercopy() a static inline function.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index cb82a6414a28..634717c9f73b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -266,11 +266,50 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
 struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size,
 				       unsigned int align, slab_flags_t flags,
 				       void (*ctor)(void *));
-struct kmem_cache *kmem_cache_create_usercopy(const char *name,
-			unsigned int size, unsigned int align,
-			slab_flags_t flags,
-			unsigned int useroffset, unsigned int usersize,
-			void (*ctor)(void *));
+
+/**
+ * kmem_cache_create_usercopy - Create a cache with a region suitable
+ * for copying to userspace
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @useroffset: Usercopy region offset
+ * @usersize: Usercopy region size
+ * @ctor: A constructor for the objects.
+ *
+ * Cannot be called within a interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline.  This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
+ */
+static inline struct kmem_cache *
+kmem_cache_create_usercopy(const char *name, unsigned int size,
+			   unsigned int align, slab_flags_t flags,
+			   unsigned int useroffset, unsigned int usersize,
+			   void (*ctor)(void *))
+{
+	struct kmem_cache_args kmem_args = {
+		.align		= align,
+		.ctor		= ctor,
+		.useroffset	= useroffset,
+		.usersize	= usersize,
+	};
+
+	return __kmem_cache_create_args(name, size, &kmem_args, flags);
+}
 
 /* If NULL is passed for @args, use this variant with default arguments. */
 static inline struct kmem_cache *
-- 
cgit v1.2.3


From 781aee755638dbb6dbfe022bdd2f732621ec9534 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Sep 2024 09:56:59 +0200
Subject: slab: make __kmem_cache_create() static inline

Make __kmem_cache_create() a static inline function.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 634717c9f73b..331412a9f4f2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -262,10 +262,17 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
 					    unsigned int object_size,
 					    struct kmem_cache_args *args,
 					    slab_flags_t flags);
+static inline struct kmem_cache *
+__kmem_cache_create(const char *name, unsigned int size, unsigned int align,
+		    slab_flags_t flags, void (*ctor)(void *))
+{
+	struct kmem_cache_args kmem_args = {
+		.align	= align,
+		.ctor	= ctor,
+	};
 
-struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size,
-				       unsigned int align, slab_flags_t flags,
-				       void (*ctor)(void *));
+	return __kmem_cache_create_args(name, size, &kmem_args, flags);
+}
 
 /**
  * kmem_cache_create_usercopy - Create a cache with a region suitable
-- 
cgit v1.2.3


From 4ba4f1afb6a9fed8ef896c2363076e36572f71da Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:37 -0700
Subject: perf: Generic hotplug support for a PMU with a scope

The perf subsystem assumes that the counters of a PMU are per-CPU. So
the user space tool reads a counter from each CPU in the system wide
mode. However, many PMUs don't have a per-CPU counter. The counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict to one CPU to stand
for a specific scope. In case the given CPU is removed,
the hotplug support has to be implemented for each such driver.

The codes to support the cpumask and hotplug are very similar.
- Expose a cpumask into sysfs
- Pickup another CPU in the same scope if the given CPU is removed.
- Invoke the perf_pmu_migrate_context() to migrate to a new CPU.
- In event init, always set the CPU in the cpumask to event->cpu

Similar duplicated codes are implemented for each such PMU driver. It
would be good to introduce a generic infrastructure to avoid such
duplication.

5 popular scopes are implemented here, core, die, cluster, pkg, and
the system-wide. The scope can be set when a PMU is registered. If so, a
"cpumask" is automatically exposed for the PMU.

The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 701549967c18..a3cbcd727366 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -295,6 +295,19 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
 
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+	PERF_PMU_SCOPE_NONE	= 0,
+	PERF_PMU_SCOPE_CORE,
+	PERF_PMU_SCOPE_DIE,
+	PERF_PMU_SCOPE_CLUSTER,
+	PERF_PMU_SCOPE_PKG,
+	PERF_PMU_SCOPE_SYS_WIDE,
+	PERF_PMU_MAX_SCOPE,
+};
+
 struct perf_output_handle;
 
 #define PMU_NULL_DEV	((void *)(~0UL))
@@ -318,6 +331,11 @@ struct pmu {
 	 */
 	int				capabilities;
 
+	/*
+	 * PMU scope
+	 */
+	unsigned int			scope;
+
 	int __percpu			*pmu_disable_count;
 	struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
 	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
-- 
cgit v1.2.3


From a48a36b316ae5d3ab83f9b545dba15998e96d59c Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:38 -0700
Subject: perf: Add PERF_EV_CAP_READ_SCOPE

Usually, an event can be read from any CPU of the scope. It doesn't need
to be read from the advertised CPU.

Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with
scope can be read from any active CPU in the scope.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-3-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a3cbcd727366..794f66057878 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -636,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
  * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
  * cannot be a group leader. If an event with this flag is detached from the
  * group it is scheduled out and moved into an unrecoverable ERROR state.
+ * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
+ * PMU scope where it is active.
  */
 #define PERF_EV_CAP_SOFTWARE		BIT(0)
 #define PERF_EV_CAP_READ_ACTIVE_PKG	BIT(1)
 #define PERF_EV_CAP_SIBLING		BIT(2)
+#define PERF_EV_CAP_READ_SCOPE		BIT(3)
 
 #define SWEVENT_HLIST_BITS		8
 #define SWEVENT_HLIST_SIZE		(1 << SWEVENT_HLIST_BITS)
-- 
cgit v1.2.3


From 08155c7f2a2cc6ff99886ea5743da274be24ebe4 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 2 Aug 2024 08:16:39 -0700
Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug

There are three cstate PMUs with different scopes, core, die and module.
The scopes are supported by the generic perf_event subsystem now.

Set the scope for each PMU and remove all the cpumask and hotplug codes.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-4-kan.liang@linux.intel.com
---
 include/linux/cpuhotplug.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9316c39260e0..2101ae2ecfca 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,7 +152,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
 	CPUHP_AP_PERF_X86_STARTING,
 	CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
-	CPUHP_AP_PERF_X86_CSTATE_STARTING,
 	CPUHP_AP_PERF_XTENSA_STARTING,
 	CPUHP_AP_ARM_VFP_STARTING,
 	CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
@@ -209,7 +208,6 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
 	CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
 	CPUHP_AP_PERF_X86_RAPL_ONLINE,
-	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
 	CPUHP_AP_PERF_ARM_CCI_ONLINE,
-- 
cgit v1.2.3


From 740c1c84bfa3d8c63bd3b01fb570e7452f51fbd8 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Tue, 10 Sep 2024 10:26:17 +0800
Subject: spi: remove spi_controller_is_slave() and spi_slave_abort()

spi_controller_is_slave() and spi_slave_abort() are all replaced,
so they can be removed.

No functional changed.

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://patch.msgid.link/20240910022618.1397-8-yangyingliang@huaweicloud.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index d47d5f14ff99..4b95663163e0 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -498,7 +498,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  *	     controller has native support for memory like operations.
  * @mem_caps: controller capabilities for the handling of memory operations.
  * @unprepare_message: undo any work done by prepare_message().
- * @slave_abort: abort the ongoing transfer request on an SPI slave controller
  * @target_abort: abort the ongoing transfer request on an SPI target controller
  * @cs_gpiods: Array of GPIO descriptors to use as chip select lines; one per CS
  *	number. Any individual value may be NULL for CS lines that
@@ -725,10 +724,7 @@ struct spi_controller {
 			       struct spi_message *message);
 	int (*unprepare_message)(struct spi_controller *ctlr,
 				 struct spi_message *message);
-	union {
-		int (*slave_abort)(struct spi_controller *ctlr);
-		int (*target_abort)(struct spi_controller *ctlr);
-	};
+	int (*target_abort)(struct spi_controller *ctlr);
 
 	/*
 	 * These hooks are for drivers that use a generic implementation
@@ -802,11 +798,6 @@ static inline void spi_controller_put(struct spi_controller *ctlr)
 		put_device(&ctlr->dev);
 }
 
-static inline bool spi_controller_is_slave(struct spi_controller *ctlr)
-{
-	return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->slave;
-}
-
 static inline bool spi_controller_is_target(struct spi_controller *ctlr)
 {
 	return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->target;
@@ -1296,7 +1287,6 @@ extern int devm_spi_optimize_message(struct device *dev, struct spi_device *spi,
 
 extern int spi_setup(struct spi_device *spi);
 extern int spi_async(struct spi_device *spi, struct spi_message *message);
-extern int spi_slave_abort(struct spi_device *spi);
 extern int spi_target_abort(struct spi_device *spi);
 
 static inline size_t
-- 
cgit v1.2.3


From d6de45e3c6f3713d3825d3e2860c11d24e0f941f Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Tue, 10 Sep 2024 00:35:03 +0200
Subject: platform/x86: asus-wmi: Disable OOBE experience on Zenbook S 16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The OOBE experience fades the keyboard backlight in & out continuously,
and make the backlight uncontrollable using its device.

Workaround taken from
https://wiki.archlinux.org/index.php?title=ASUS_Zenbook_UM5606&diff=next&oldid=815547

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240909223503.1445779-1-bas@basnieuwenhuizen.nl
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/platform_data/x86/asus-wmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 0aeeae1c1943..ae9bf7479e7b 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -62,6 +62,7 @@
 #define ASUS_WMI_DEVID_KBD_BACKLIGHT	0x00050021
 #define ASUS_WMI_DEVID_LIGHT_SENSOR	0x00050022 /* ?? */
 #define ASUS_WMI_DEVID_LIGHTBAR		0x00050025
+#define ASUS_WMI_DEVID_OOBE		0x0005002F
 /* This can only be used to disable the screen, not re-enable */
 #define ASUS_WMI_DEVID_SCREENPAD_POWER	0x00050031
 /* Writing a brightness re-enables the screen if disabled */
-- 
cgit v1.2.3


From 070a5e6295e88a2b75cbfef04ff8b4ec05775e6d Mon Sep 17 00:00:00 2001
From: Furong Xu <0x1207@gmail.com>
Date: Fri, 6 Sep 2024 22:30:06 +0800
Subject: net: stmmac: move stmmac_fpe_cfg to stmmac_priv data

By moving the fpe_cfg field to the stmmac_priv data, stmmac_fpe_cfg
becomes platform-data eventually, instead of a run-time config.

Suggested-by: Serge Semin <fancer.lancer@gmail.com>
Signed-off-by: Furong Xu <0x1207@gmail.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Link: https://patch.msgid.link/d9b3d7ecb308c5e39778a4c8ae9df288a2754379.1725631883.git.0x1207@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/stmmac.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 338991c08f00..d79ff252cfdc 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -138,33 +138,6 @@ struct stmmac_txq_cfg {
 	int tbs_en;
 };
 
-/* FPE link state */
-enum stmmac_fpe_state {
-	FPE_STATE_OFF = 0,
-	FPE_STATE_CAPABLE = 1,
-	FPE_STATE_ENTERING_ON = 2,
-	FPE_STATE_ON = 3,
-};
-
-/* FPE link-partner hand-shaking mPacket type */
-enum stmmac_mpacket_type {
-	MPACKET_VERIFY = 0,
-	MPACKET_RESPONSE = 1,
-};
-
-enum stmmac_fpe_task_state_t {
-	__FPE_REMOVING,
-	__FPE_TASK_SCHED,
-};
-
-struct stmmac_fpe_cfg {
-	bool enable;				/* FPE enable */
-	bool hs_enable;				/* FPE handshake enable */
-	enum stmmac_fpe_state lp_fpe_state;	/* Link Partner FPE state */
-	enum stmmac_fpe_state lo_fpe_state;	/* Local station FPE state */
-	u32 fpe_csr;				/* MAC_FPE_CTRL_STS reg cache */
-};
-
 struct stmmac_safety_feature_cfg {
 	u32 tsoee;
 	u32 mrxpee;
@@ -232,7 +205,6 @@ struct plat_stmmacenet_data {
 	struct fwnode_handle *port_node;
 	struct device_node *mdio_node;
 	struct stmmac_dma_cfg *dma_cfg;
-	struct stmmac_fpe_cfg *fpe_cfg;
 	struct stmmac_safety_feature_cfg *safety_feat_cfg;
 	int clk_csr;
 	int has_gmac;
-- 
cgit v1.2.3


From d591f6804e7e1310881c9224d72247a2b65039af Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 27 Aug 2024 18:48:46 -0500
Subject: PCI: Wait for device readiness with Configuration RRS

After a device reset, delays are required before the device can
successfully complete config accesses.  PCIe r6.0, sec 6.6, specifies some
delays required before software can perform config accesses.  Devices that
require more time after those delays may respond to config accesses with
Configuration Request Retry Status (RRS) completions.

Callers of pci_dev_wait() are responsible for delays until the device can
respond to config accesses.  pci_dev_wait() waits any additional time until
the device can successfully complete config accesses.

Reading config space of devices that are not present or not ready typically
returns ~0 (PCI_ERROR_RESPONSE).  Previously we polled the Command register
until we got a value other than ~0.  This is sometimes a problem because
Root Complex handling of RRS completions may include several retries and
implementation-specific behavior that is invisible to software (see sec
2.3.2), so the exponential backoff in pci_dev_wait() may not work as
intended.

Linux enables Configuration RRS Software Visibility on all Root Ports that
support it.  If it is enabled, read the Vendor ID instead of the Command
register.  RRS completions cause immediate return of the 0x0001 reserved
Vendor ID value, so the pci_dev_wait() backoff works correctly.

When a read of Vendor ID eventually completes successfully by returning a
non-0x0001 value (the Vendor ID or 0xffff for VFs), the device should be
initialized and ready to respond to config requests.

For conventional PCI devices or devices below Root Ports that don't support
Configuration RRS Software Visibility, poll the Command register as before.

This was developed independently, but is very similar to Stanislav
Spassov's previous work at
https://lore.kernel.org/linux-pci/20200223122057.6504-1-stanspas@amazon.com

Link: https://lore.kernel.org/r/20240827234848.4429-2-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Duc Dang <ducdang@google.com>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..121d8d94d6d0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -371,6 +371,7 @@ struct pci_dev {
 					   can be generated */
 	unsigned int	pme_poll:1;	/* Poll device's PME status bit */
 	unsigned int	pinned:1;	/* Whether this dev is pinned */
+	unsigned int	config_crs_sv:1; /* Config CRS software visibility */
 	unsigned int	imm_ready:1;	/* Supports Immediate Readiness */
 	unsigned int	d1_support:1;	/* Low power state D1 is supported */
 	unsigned int	d2_support:1;	/* Low power state D2 is supported */
-- 
cgit v1.2.3


From 87f10faf166a9114aa0d4132298cad379de16fdd Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 27 Aug 2024 18:48:48 -0500
Subject: PCI: Rename CRS Completion Status to RRS

PCIe r6.0 changed the abbreviation for "Configuration Request Retry Status"
Completion Status from "CRS" to "RRS" and uses the terminology of
"Configuration RRS Software Visibility" instead of "CRS Software
Visibility".

Align the Linux usage with the r6.0 spec language.  No functional change
intended.

It's confusing to make this change, but I think "RRS" *is* a better
abbreviation because it was easy to interpret "CRS" as "Completion Retry
Status", which really didn't make any sense.

Link: https://lore.kernel.org/r/20240827234848.4429-4-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/bcma/bcma_driver_pci.h | 2 +-
 include/linux/pci.h                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h
index 68da8dba5162..dba41b65ae0d 100644
--- a/include/linux/bcma/bcma_driver_pci.h
+++ b/include/linux/bcma/bcma_driver_pci.h
@@ -203,7 +203,7 @@ struct pci_dev;
 #define BCMA_CORE_PCI_MDIO_RXCTRL0		0x840
 
 /* PCIE Root Capability Register bits (Host mode only) */
-#define BCMA_CORE_PCI_RC_CRS_VISIBILITY		0x0001
+#define BCMA_CORE_PCI_RC_RRS_VISIBILITY		0x0001
 
 struct bcma_drv_pci;
 struct bcma_bus;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 121d8d94d6d0..27d8ce977674 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -371,7 +371,7 @@ struct pci_dev {
 					   can be generated */
 	unsigned int	pme_poll:1;	/* Poll device's PME status bit */
 	unsigned int	pinned:1;	/* Whether this dev is pinned */
-	unsigned int	config_crs_sv:1; /* Config CRS software visibility */
+	unsigned int	config_rrs_sv:1; /* Config RRS software visibility */
 	unsigned int	imm_ready:1;	/* Supports Immediate Readiness */
 	unsigned int	d1_support:1;	/* Low power state D1 is supported */
 	unsigned int	d2_support:1;	/* Low power state D2 is supported */
-- 
cgit v1.2.3


From 884ee6dc85b959bc152f15bca80c30f06069e6c4 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Fri, 6 Sep 2024 14:27:24 +0800
Subject: f2fs: get rid of online repaire on corrupted directory

syzbot reports a f2fs bug as below:

kernel BUG at fs/f2fs/inode.c:896!
RIP: 0010:f2fs_evict_inode+0x1598/0x15c0 fs/f2fs/inode.c:896
Call Trace:
 evict+0x532/0x950 fs/inode.c:704
 dispose_list fs/inode.c:747 [inline]
 evict_inodes+0x5f9/0x690 fs/inode.c:797
 generic_shutdown_super+0x9d/0x2d0 fs/super.c:627
 kill_block_super+0x44/0x90 fs/super.c:1696
 kill_f2fs_super+0x344/0x690 fs/f2fs/super.c:4898
 deactivate_locked_super+0xc4/0x130 fs/super.c:473
 cleanup_mnt+0x41f/0x4b0 fs/namespace.c:1373
 task_work_run+0x24f/0x310 kernel/task_work.c:228
 ptrace_notify+0x2d2/0x380 kernel/signal.c:2402
 ptrace_report_syscall include/linux/ptrace.h:415 [inline]
 ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline]
 syscall_exit_work+0xc6/0x190 kernel/entry/common.c:173
 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline]
 __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline]
 syscall_exit_to_user_mode+0x279/0x370 kernel/entry/common.c:218
 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0010:f2fs_evict_inode+0x1598/0x15c0 fs/f2fs/inode.c:896

Online repaire on corrupted directory in f2fs_lookup() can generate
dirty data/meta while racing w/ readonly remount, it may leave dirty
inode after filesystem becomes readonly, however, checkpoint() will
skips flushing dirty inode in a state of readonly mode, result in
above panic.

Let's get rid of online repaire in f2fs_lookup(), and leave the work
to fsck.f2fs.

Fixes: 510022a85839 ("f2fs: add F2FS_INLINE_DOTS to recover missing dot dentries")
Reported-by: syzbot+ebea2790904673d7c618@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/000000000000a7b20f061ff2d56a@google.com
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index f17f89a259e2..b0b821edfd97 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -278,7 +278,7 @@ struct node_footer {
 #define F2FS_INLINE_DATA	0x02	/* file inline data flag */
 #define F2FS_INLINE_DENTRY	0x04	/* file inline dentry flag */
 #define F2FS_DATA_EXIST		0x08	/* file inline data exist flag */
-#define F2FS_INLINE_DOTS	0x10	/* file having implicit dot dentries */
+#define F2FS_INLINE_DOTS	0x10	/* file having implicit dot dentries (obsolete) */
 #define F2FS_EXTRA_ATTR		0x20	/* file having extra attribute */
 #define F2FS_PIN_FILE		0x40	/* file should not be gced */
 #define F2FS_COMPRESS_RELEASED	0x80	/* file released compressed blocks */
-- 
cgit v1.2.3


From 540c830212edc908361c39d40df21772a8bda308 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 9 Sep 2024 20:30:18 +0000
Subject: firmware: imx: remove duplicate scmi_imx_misc_ctrl_get()

These two functions have a stub definition when CONFIG_IMX_SCMI_MISC_EXT
is not set, which conflict with the global definition:

In file included from drivers/firmware/imx/sm-misc.c:6:
include/linux/firmware/imx/sm.h:30:1: error: expected identifier or '(' before '{' token
   30 | {
      | ^
drivers/firmware/imx/sm-misc.c:26:5: error: redefinition of 'scmi_imx_misc_ctrl_get'
   26 | int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val)
      |     ^~~~~~~~~~~~~~~~~~~~~~
include/linux/firmware/imx/sm.h:24:19: note: previous definition of 'scmi_imx_misc_ctrl_get' with type 'int(u32,  u32 *, u32 *)' {aka 'int(unsigned int,  unsigned int *, unsigned int *)'}
   24 | static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val)
      |                   ^~~~~~~~~~~~~~~~~~~~~~

There is no real need for the #ifdef, and removing this avoids
the build failure.

Fixes: 0b4f8a68b292 ("firmware: imx: Add i.MX95 MISC driver")
Link: https://lore.kernel.org/r/20240909203023.1275232-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/firmware/imx/sm.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h
index 62a2690e2abd..9b85a3f028d1 100644
--- a/include/linux/firmware/imx/sm.h
+++ b/include/linux/firmware/imx/sm.h
@@ -17,18 +17,7 @@
 #define SCMI_IMX_CTRL_SAI4_MCLK		4	/* WAKE SAI4 MCLK */
 #define SCMI_IMX_CTRL_SAI5_MCLK		5	/* WAKE SAI5 MCLK */
 
-#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_EXT)
 int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val);
 int scmi_imx_misc_ctrl_set(u32 id, u32 val);
-#else
-static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val)
-{
-	return -EOPNOTSUPP;
-}
 
-static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val);
-{
-	return -EOPNOTSUPP;
-}
-#endif
 #endif
-- 
cgit v1.2.3


From 4b3fc475c61fa8733a9acc9d743f6cc9ca4915f5 Mon Sep 17 00:00:00 2001
From: Russell King <linux@armlinux.org.uk>
Date: Fri, 6 Sep 2024 16:05:07 +0530
Subject: net: phylink: Add phylink_set_fixed_link() to configure fixed link
 state in phylink

The function allows for the configuration of a fixed link state for a given
phylink instance. This addition is particularly useful for network devices that
operate with a fixed link configuration, where the link parameters do not change
dynamically. By using `phylink_set_fixed_link()`, drivers can easily set up
the fixed link state during initialization or configuration changes.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <linux@armlinux.org.uk>
Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phylink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 2381e07429a2..5c01048860c4 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -598,6 +598,8 @@ int phylink_fwnode_phy_connect(struct phylink *pl,
 			       const struct fwnode_handle *fwnode,
 			       u32 flags);
 void phylink_disconnect_phy(struct phylink *);
+int phylink_set_fixed_link(struct phylink *,
+			   const struct phylink_link_state *);
 
 void phylink_mac_change(struct phylink *, bool up);
 void phylink_pcs_change(struct phylink_pcs *, bool up);
-- 
cgit v1.2.3


From cef7dde8836ab09a3bfe96ada4f18ef2496eacc9 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@nvidia.com>
Date: Mon, 9 Sep 2024 13:04:57 +0300
Subject: net/mlx5: Expand mkey page size to support 6 bits

Protect the usage of the 6th bit with the relevant capability to ensure
we are using the new page sizes with FW that supports the bit extension.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909100504.29797-2-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 970c9d8473ef..ec1117d4e441 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1988,7 +1988,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8         migratable[0x1];
 	u8         reserved_at_81[0x11];
 	u8         query_vuid[0x1];
-	u8         reserved_at_93[0xd];
+	u8         reserved_at_93[0x5];
+	u8         umr_log_entity_size_5[0x1];
+	u8         reserved_at_99[0x7];
 
 	u8	   max_reformat_insert_size[0x8];
 	u8	   max_reformat_insert_offset[0x8];
@@ -4212,8 +4214,7 @@ struct mlx5_ifc_mkc_bits {
 
 	u8         reserved_at_1c0[0x19];
 	u8         relaxed_ordering_read[0x1];
-	u8         reserved_at_1d9[0x1];
-	u8         log_page_size[0x5];
+	u8         log_page_size[0x6];
 
 	u8         reserved_at_1e0[0x20];
 };
-- 
cgit v1.2.3


From 6cd9171d04cff79abe78c166927ab8563bf95fe5 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@nvidia.com>
Date: Mon, 9 Sep 2024 13:04:58 +0300
Subject: net/mlx5: Expose HW bits for Memory scheme ODP

Expose IFC bits to support the new memory scheme on demand paging.
Change the macro reading odp capabilities to be able to read from the
new IFC layout and align the code in upper layers to be compiled.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909100504.29797-3-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h   |  4 +++
 include/linux/mlx5/mlx5_ifc.h | 57 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 50 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index ba875a619b97..bd081f276654 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1369,6 +1369,10 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_ODP(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap)
 
+#define MLX5_CAP_ODP_SCHEME(mdev, cap)                       \
+	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \
+		 transport_page_fault_scheme_cap.cap)
+
 #define MLX5_CAP_ODP_MAX(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ec1117d4e441..3e3336bb9191 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1326,11 +1326,13 @@ struct mlx5_ifc_atomic_caps_bits {
 	u8         reserved_at_e0[0x720];
 };
 
-struct mlx5_ifc_odp_cap_bits {
+struct mlx5_ifc_odp_scheme_cap_bits {
 	u8         reserved_at_0[0x40];
 
 	u8         sig[0x1];
-	u8         reserved_at_41[0x1f];
+	u8         reserved_at_41[0x4];
+	u8         page_prefetch[0x1];
+	u8         reserved_at_46[0x1a];
 
 	u8         reserved_at_60[0x20];
 
@@ -1344,7 +1346,20 @@ struct mlx5_ifc_odp_cap_bits {
 
 	struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps;
 
-	u8         reserved_at_120[0x6E0];
+	u8         reserved_at_120[0xe0];
+};
+
+struct mlx5_ifc_odp_cap_bits {
+	struct mlx5_ifc_odp_scheme_cap_bits transport_page_fault_scheme_cap;
+
+	struct mlx5_ifc_odp_scheme_cap_bits memory_page_fault_scheme_cap;
+
+	u8         reserved_at_400[0x200];
+
+	u8         mem_page_fault[0x1];
+	u8         reserved_at_601[0x1f];
+
+	u8         reserved_at_620[0x1e0];
 };
 
 struct mlx5_ifc_tls_cap_bits {
@@ -2034,7 +2049,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   min_mkey_log_entity_size_fixed_buffer[0x5];
 	u8	   ec_vf_vport_base[0x10];
 
-	u8	   reserved_at_3a0[0x10];
+	u8	   reserved_at_3a0[0xa];
+	u8	   max_mkey_log_entity_size_mtt[0x6];
 	u8	   max_rqt_vhca_id[0x10];
 
 	u8	   reserved_at_3c0[0x20];
@@ -7258,6 +7274,30 @@ struct mlx5_ifc_qp_2err_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_trans_page_fault_info_bits {
+	u8         error[0x1];
+	u8         reserved_at_1[0x4];
+	u8         page_fault_type[0x3];
+	u8         wq_number[0x18];
+
+	u8         reserved_at_20[0x8];
+	u8         fault_token[0x18];
+};
+
+struct mlx5_ifc_mem_page_fault_info_bits {
+	u8          error[0x1];
+	u8          reserved_at_1[0xf];
+	u8          fault_token_47_32[0x10];
+
+	u8          fault_token_31_0[0x20];
+};
+
+union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits {
+	struct mlx5_ifc_trans_page_fault_info_bits trans_page_fault_info;
+	struct mlx5_ifc_mem_page_fault_info_bits mem_page_fault_info;
+	u8          reserved_at_0[0x40];
+};
+
 struct mlx5_ifc_page_fault_resume_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -7274,13 +7314,8 @@ struct mlx5_ifc_page_fault_resume_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         error[0x1];
-	u8         reserved_at_41[0x4];
-	u8         page_fault_type[0x3];
-	u8         wq_number[0x18];
-
-	u8         reserved_at_60[0x8];
-	u8         token[0x18];
+	union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits
+		page_fault_info;
 };
 
 struct mlx5_ifc_nop_out_bits {
-- 
cgit v1.2.3


From 64c68385a39bb676c76da36164ab696e8da78842 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@nvidia.com>
Date: Mon, 9 Sep 2024 13:04:59 +0300
Subject: RDMA/mlx5: Add new ODP memory scheme eqe format

Add new fields to support the new memory scheme page fault and extend
the token field to u64 as in the new scheme the token is 48 bit.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909100504.29797-4-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index bd081f276654..154095256d0d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -211,6 +211,7 @@ enum {
 enum {
 	MLX5_PFAULT_SUBTYPE_WQE = 0,
 	MLX5_PFAULT_SUBTYPE_RDMA = 1,
+	MLX5_PFAULT_SUBTYPE_MEMORY = 2,
 };
 
 enum wqe_page_fault_type {
@@ -646,10 +647,11 @@ struct mlx5_eqe_page_req {
 	__be32		rsvd1[5];
 };
 
+#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
 struct mlx5_eqe_page_fault {
-	__be32 bytes_committed;
 	union {
 		struct {
+			__be32  bytes_committed;
 			u16     reserved1;
 			__be16  wqe_index;
 			u16	reserved2;
@@ -659,6 +661,7 @@ struct mlx5_eqe_page_fault {
 			__be32  pftype_wq;
 		} __packed wqe;
 		struct {
+			__be32  bytes_committed;
 			__be32  r_key;
 			u16	reserved1;
 			__be16  packet_length;
@@ -666,6 +669,23 @@ struct mlx5_eqe_page_fault {
 			__be64  rdma_va;
 			__be32  pftype_token;
 		} __packed rdma;
+		struct {
+			u8      flags;
+			u8      reserved1;
+			__be16  post_demand_fault_pages;
+			__be16  pre_demand_fault_pages;
+			__be16  token47_32;
+			__be32  token31_0;
+			/*
+			 * FW changed from specifying the fault size in byte
+			 * count to 4k pages granularity. The size specified
+			 * in pages uses bits 31:12, to keep backward
+			 * compatibility.
+			 */
+			__be32 demand_fault_pages;
+			__be32  mkey;
+			__be64  va;
+		} __packed memory;
 	} __packed;
 } __packed;
 
-- 
cgit v1.2.3


From c912ac66b3fc92acb0079fff7d030e9be0756c36 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 9 Sep 2024 15:41:05 +0300
Subject: platform/x86: intel_scu_ipc: Move intel_scu_ipc.h out of
 arch/x86/include/asm

This is a platform/x86 library that is mostly being used by other
drivers not directly under arch/x86 anyway (with the exception of the
Intel MID setup code) so it makes sense that it lives under the
platform_data/x86/ directory instead.

No functional changes intended.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240909124952.1152017-3-andriy.shevchenko@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/intel_scu_ipc.h | 68 +++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 include/linux/platform_data/x86/intel_scu_ipc.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/intel_scu_ipc.h b/include/linux/platform_data/x86/intel_scu_ipc.h
new file mode 100644
index 000000000000..0ca9962e97f2
--- /dev/null
+++ b/include/linux/platform_data/x86/intel_scu_ipc.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PLATFORM_X86_INTEL_SCU_IPC_H_
+#define __PLATFORM_X86_INTEL_SCU_IPC_H_
+
+#include <linux/ioport.h>
+
+struct device;
+struct intel_scu_ipc_dev;
+
+/**
+ * struct intel_scu_ipc_data - Data used to configure SCU IPC
+ * @mem: Base address of SCU IPC MMIO registers
+ * @irq: The IRQ number used for SCU (optional)
+ */
+struct intel_scu_ipc_data {
+	struct resource mem;
+	int irq;
+};
+
+struct intel_scu_ipc_dev *
+__intel_scu_ipc_register(struct device *parent,
+			 const struct intel_scu_ipc_data *scu_data,
+			 struct module *owner);
+
+#define intel_scu_ipc_register(parent, scu_data)  \
+	__intel_scu_ipc_register(parent, scu_data, THIS_MODULE)
+
+void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu);
+
+struct intel_scu_ipc_dev *
+__devm_intel_scu_ipc_register(struct device *parent,
+			      const struct intel_scu_ipc_data *scu_data,
+			      struct module *owner);
+
+#define devm_intel_scu_ipc_register(parent, scu_data)  \
+	__devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE)
+
+struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void);
+void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu);
+struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev);
+
+int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr,
+			      u8 *data);
+int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr,
+			       u8 data);
+int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr,
+			    u8 *data, size_t len);
+int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr,
+			     u8 *data, size_t len);
+
+int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr,
+			     u8 data, u8 mask);
+
+int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd,
+				     int sub);
+int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd,
+					int sub, const void *in, size_t inlen,
+					size_t size, void *out, size_t outlen);
+
+static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd,
+					    int sub, const void *in, size_t inlen,
+					    void *out, size_t outlen)
+{
+	return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen,
+						   inlen, out, outlen);
+}
+
+#endif
-- 
cgit v1.2.3


From 5f1cda51107fd5e53cd012bd1f6f39aede96bdbe Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 9 Sep 2024 15:41:06 +0300
Subject: platform/x86: intel_scu_wdt: Move intel_scu_wdt.h to x86 subfolder

This is a platform/x86 library that can only be used on x86 devices.
so it makes sense that it lives under the platform_data/x86/ directory
instead.

No functional changes intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240909124952.1152017-4-andriy.shevchenko@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/intel-mid_wdt.h     | 19 -------------------
 include/linux/platform_data/x86/intel-mid_wdt.h | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)
 delete mode 100644 include/linux/platform_data/intel-mid_wdt.h
 create mode 100644 include/linux/platform_data/x86/intel-mid_wdt.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/intel-mid_wdt.h b/include/linux/platform_data/intel-mid_wdt.h
deleted file mode 100644
index 8dba70b4b020..000000000000
--- a/include/linux/platform_data/intel-mid_wdt.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- *      intel-mid_wdt: generic Intel MID SCU watchdog driver
- *
- *      Copyright (C) 2014 Intel Corporation. All rights reserved.
- *      Contact: David Cohen <david.a.cohen@linux.intel.com>
- */
-
-#ifndef __INTEL_MID_WDT_H__
-#define __INTEL_MID_WDT_H__
-
-#include <linux/platform_device.h>
-
-struct intel_mid_wdt_pdata {
-	int irq;
-	int (*probe)(struct platform_device *pdev);
-};
-
-#endif /*__INTEL_MID_WDT_H__*/
diff --git a/include/linux/platform_data/x86/intel-mid_wdt.h b/include/linux/platform_data/x86/intel-mid_wdt.h
new file mode 100644
index 000000000000..e5c0210d0fec
--- /dev/null
+++ b/include/linux/platform_data/x86/intel-mid_wdt.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *      intel-mid_wdt: generic Intel MID SCU watchdog driver
+ *
+ *      Copyright (C) 2014 Intel Corporation. All rights reserved.
+ *      Contact: David Cohen <david.a.cohen@linux.intel.com>
+ */
+
+#ifndef __PLATFORM_X86_INTEL_MID_WDT_H_
+#define __PLATFORM_X86_INTEL_MID_WDT_H_
+
+#include <linux/platform_device.h>
+
+struct intel_mid_wdt_pdata {
+	int irq;
+	int (*probe)(struct platform_device *pdev);
+};
+
+#endif	/* __PLATFORM_X86_INTEL_MID_WDT_H_ */
-- 
cgit v1.2.3


From d3bfbfb1248498656cd25c51e41c1e31219bd0dd Mon Sep 17 00:00:00 2001
From: Kundan Kumar <kundan.kumar@samsung.com>
Date: Wed, 11 Sep 2024 12:19:34 +0530
Subject: mm: release number of pages of a folio

Add a new function unpin_user_folio() to put the refs of a folio by
npages count.

The check for BIO_PAGE_PINNED flag is removed as it is already checked
in bio_release_pages().

Signed-off-by: Kundan Kumar <kundan.kumar@samsung.com>
Tested-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20240911064935.5630-4-kundan.kumar@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a20b76..2fd88cd5997d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1597,6 +1597,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 				      bool make_dirty);
 void unpin_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_folio(struct folio *folio, unsigned long npages);
 void unpin_folios(struct folio **folios, unsigned long nfolios);
 
 static inline bool is_cow_mapping(vm_flags_t flags)
-- 
cgit v1.2.3


From b03ffc76b83c1a7d058454efbcf1bf0e345ef1c2 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Fri, 6 Sep 2024 15:13:31 +0200
Subject: soc: qcom: geni-se: add GP_LENGTH/IRQ_EN_SET/IRQ_EN_CLEAR registers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For UART devices the M_GP_LENGTH is the TX word count. For other
devices this is the transaction word count.

For UART devices the S_GP_LENGTH is the RX word count.

The IRQ_EN set/clear registers allow you to set or clear bits in the
IRQ_EN register without needing a read-modify-write.

Acked-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20240610152420.v4.1.Ife7ced506aef1be3158712aa3ff34a006b973559@changeid
Tested-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20240906131336.23625-4-johan+linaro@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/soc/qcom/geni-se.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h
index 0f038a1a0330..c3bca9c0bf2c 100644
--- a/include/linux/soc/qcom/geni-se.h
+++ b/include/linux/soc/qcom/geni-se.h
@@ -88,11 +88,15 @@ struct geni_se {
 #define SE_GENI_M_IRQ_STATUS		0x610
 #define SE_GENI_M_IRQ_EN		0x614
 #define SE_GENI_M_IRQ_CLEAR		0x618
+#define SE_GENI_M_IRQ_EN_SET		0x61c
+#define SE_GENI_M_IRQ_EN_CLEAR		0x620
 #define SE_GENI_S_CMD0			0x630
 #define SE_GENI_S_CMD_CTRL_REG		0x634
 #define SE_GENI_S_IRQ_STATUS		0x640
 #define SE_GENI_S_IRQ_EN		0x644
 #define SE_GENI_S_IRQ_CLEAR		0x648
+#define SE_GENI_S_IRQ_EN_SET		0x64c
+#define SE_GENI_S_IRQ_EN_CLEAR		0x650
 #define SE_GENI_TX_FIFOn		0x700
 #define SE_GENI_RX_FIFOn		0x780
 #define SE_GENI_TX_FIFO_STATUS		0x800
@@ -101,6 +105,8 @@ struct geni_se {
 #define SE_GENI_RX_WATERMARK_REG	0x810
 #define SE_GENI_RX_RFR_WATERMARK_REG	0x814
 #define SE_GENI_IOS			0x908
+#define SE_GENI_M_GP_LENGTH		0x910
+#define SE_GENI_S_GP_LENGTH		0x914
 #define SE_DMA_TX_IRQ_STAT		0xc40
 #define SE_DMA_TX_IRQ_CLR		0xc44
 #define SE_DMA_TX_FSM_RST		0xc58
@@ -234,6 +240,9 @@ struct geni_se {
 #define IO2_DATA_IN			BIT(1)
 #define RX_DATA_IN			BIT(0)
 
+/* SE_GENI_M_GP_LENGTH and SE_GENI_S_GP_LENGTH fields */
+#define GP_LENGTH			GENMASK(31, 0)
+
 /* SE_DMA_TX_IRQ_STAT Register fields */
 #define TX_DMA_DONE			BIT(0)
 #define TX_EOT				BIT(1)
-- 
cgit v1.2.3


From 4c59c59ef3ab9275f2a8f84dcffbd16c285ce8ea Mon Sep 17 00:00:00 2001
From: Nick Chan <towinchenmi@gmail.com>
Date: Wed, 11 Sep 2024 13:02:11 +0800
Subject: tty: serial: samsung: Use bit manipulation macros for APPLE_S5L_*

New entries using BIT() will be added soon, so change the existing ones to
use bit manipulation macros including BIT() and GENMASK() for
consistency.

Suggested-by: Krzysztof Kozlowski <krzk@kernel.org>
Tested-by: Janne Grunau <j@jannau.net>
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Nick Chan <towinchenmi@gmail.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Link: https://lore.kernel.org/r/20240911050741.14477-2-towinchenmi@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_s3c.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index 1672cf0810ef..2a934e20ca4b 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -249,9 +249,9 @@
 #define APPLE_S5L_UCON_RXTO_ENA		9
 #define APPLE_S5L_UCON_RXTHRESH_ENA	12
 #define APPLE_S5L_UCON_TXTHRESH_ENA	13
-#define APPLE_S5L_UCON_RXTO_ENA_MSK	(1 << APPLE_S5L_UCON_RXTO_ENA)
-#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK	(1 << APPLE_S5L_UCON_RXTHRESH_ENA)
-#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK	(1 << APPLE_S5L_UCON_TXTHRESH_ENA)
+#define APPLE_S5L_UCON_RXTO_ENA_MSK	BIT(APPLE_S5L_UCON_RXTO_ENA)
+#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK	BIT(APPLE_S5L_UCON_RXTHRESH_ENA)
+#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK	BIT(APPLE_S5L_UCON_TXTHRESH_ENA)
 
 #define APPLE_S5L_UCON_DEFAULT		(S3C2410_UCON_TXIRQMODE | \
 					 S3C2410_UCON_RXIRQMODE | \
@@ -260,10 +260,10 @@
 					 APPLE_S5L_UCON_RXTHRESH_ENA_MSK | \
 					 APPLE_S5L_UCON_TXTHRESH_ENA_MSK)
 
-#define APPLE_S5L_UTRSTAT_RXTHRESH	(1<<4)
-#define APPLE_S5L_UTRSTAT_TXTHRESH	(1<<5)
-#define APPLE_S5L_UTRSTAT_RXTO		(1<<9)
-#define APPLE_S5L_UTRSTAT_ALL_FLAGS	(0x3f0)
+#define APPLE_S5L_UTRSTAT_RXTHRESH	BIT(4)
+#define APPLE_S5L_UTRSTAT_TXTHRESH	BIT(5)
+#define APPLE_S5L_UTRSTAT_RXTO		BIT(9)
+#define APPLE_S5L_UTRSTAT_ALL_FLAGS	GENMASK(9, 4)
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.2.3


From 5ed771f174726ae879945d4f148a9005ac909cb7 Mon Sep 17 00:00:00 2001
From: Nick Chan <towinchenmi@gmail.com>
Date: Wed, 11 Sep 2024 13:02:13 +0800
Subject: tty: serial: samsung: Fix serial rx on Apple A7-A9

Apple's older A7-A9 SoCs seems to use bit 3 in UTRSTAT as RXTO, which is
enabled by bit 11 in UCON.

Access these bits in addition to the original RXTO and RXTO enable bits,
to allow serial rx to function on A7-A9 SoCs. This change does not
appear to affect the A10 SoC and up.

Tested-by: Janne Grunau <j@jannau.net>
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Nick Chan <towinchenmi@gmail.com>
Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Link: https://lore.kernel.org/r/20240911050741.14477-4-towinchenmi@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_s3c.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index 2a934e20ca4b..102aa33d956c 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -246,24 +246,28 @@
 				 S5PV210_UFCON_TXTRIG4 |	\
 				 S5PV210_UFCON_RXTRIG4)
 
-#define APPLE_S5L_UCON_RXTO_ENA		9
-#define APPLE_S5L_UCON_RXTHRESH_ENA	12
-#define APPLE_S5L_UCON_TXTHRESH_ENA	13
-#define APPLE_S5L_UCON_RXTO_ENA_MSK	BIT(APPLE_S5L_UCON_RXTO_ENA)
-#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK	BIT(APPLE_S5L_UCON_RXTHRESH_ENA)
-#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK	BIT(APPLE_S5L_UCON_TXTHRESH_ENA)
+#define APPLE_S5L_UCON_RXTO_ENA			9
+#define APPLE_S5L_UCON_RXTO_LEGACY_ENA		11
+#define APPLE_S5L_UCON_RXTHRESH_ENA		12
+#define APPLE_S5L_UCON_TXTHRESH_ENA		13
+#define APPLE_S5L_UCON_RXTO_ENA_MSK		BIT(APPLE_S5L_UCON_RXTO_ENA)
+#define APPLE_S5L_UCON_RXTO_LEGACY_ENA_MSK	BIT(APPLE_S5L_UCON_RXTO_LEGACY_ENA)
+#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK		BIT(APPLE_S5L_UCON_RXTHRESH_ENA)
+#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK		BIT(APPLE_S5L_UCON_TXTHRESH_ENA)
 
 #define APPLE_S5L_UCON_DEFAULT		(S3C2410_UCON_TXIRQMODE | \
 					 S3C2410_UCON_RXIRQMODE | \
 					 S3C2410_UCON_RXFIFO_TOI)
 #define APPLE_S5L_UCON_MASK		(APPLE_S5L_UCON_RXTO_ENA_MSK | \
+					 APPLE_S5L_UCON_RXTO_LEGACY_ENA_MSK | \
 					 APPLE_S5L_UCON_RXTHRESH_ENA_MSK | \
 					 APPLE_S5L_UCON_TXTHRESH_ENA_MSK)
 
+#define APPLE_S5L_UTRSTAT_RXTO_LEGACY	BIT(3)
 #define APPLE_S5L_UTRSTAT_RXTHRESH	BIT(4)
 #define APPLE_S5L_UTRSTAT_TXTHRESH	BIT(5)
 #define APPLE_S5L_UTRSTAT_RXTO		BIT(9)
-#define APPLE_S5L_UTRSTAT_ALL_FLAGS	GENMASK(9, 4)
+#define APPLE_S5L_UTRSTAT_ALL_FLAGS	GENMASK(9, 3)
 
 #ifndef __ASSEMBLY__
 
-- 
cgit v1.2.3


From 6746ee4c3a189f8b60694f01e7e29bc5ff7972e0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 11 Sep 2024 17:34:37 +0100
Subject: io_uring/cmd: expose iowq to cmds

When an io_uring request needs blocking context we offload it to the
io_uring's thread pool called io-wq. We can get there off ->uring_cmd
by returning -EAGAIN, but there is no straightforward way of doing that
from an asynchronous callback. Add a helper that would transfer a
command to a blocking context.

Note, we do an extra hop via task_work before io_queue_iowq(), that's a
limitation of io_uring infra we have that can likely be lifted later
if that would ever become a problem.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f735f807d7c8ba50c9452c69dfe5d3e9e535037b.1726072086.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 447fbfd32215..86ceb3383e49 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -48,6 +48,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
 
+/* Execute the request from a blocking context */
+void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
+
 #else
 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
@@ -67,6 +70,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
 }
+static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
+{
+}
 #endif
 
 /*
-- 
cgit v1.2.3


From a6ccb48e13662bcb98282e051512b9686b02d353 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 11 Sep 2024 17:34:38 +0100
Subject: io_uring/cmd: give inline space in request to cmds

Some io_uring commands can use some inline space in io_kiocb. We have 32
bytes in struct io_uring_cmd, expose it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/7ca779a61ee5e166e535d70df9c7f07b15d8a0ce.1726072086.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 86ceb3383e49..c189d36ad55e 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
 	return sqe->cmd;
 }
 
+static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
+{
+	BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
+}
+#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
+	io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
+	((pdu_type *)&(cmd)->pdu) \
+)
+
 #if defined(CONFIG_IO_URING)
 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd);
-- 
cgit v1.2.3


From a12c883a0a6a005cfb3ad01feaf783e2248bfc3e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 11 Sep 2024 17:34:39 +0100
Subject: filemap: introduce filemap_invalidate_pages

kiocb_invalidate_pages() is useful for the write path, however not
everything is backed by kiocb and we want to reuse the function for bio
based discard implementation. Extract and and reuse a new helper called
filemap_invalidate_pages(), which takes a argument indicating whether it
should be non-blocking and might return -EAGAIN.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f81374b52c92d0dce0f01a279d1eed42b54056aa.1726072086.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/pagemap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d9c7edb6422b..e39c3a7ce33c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		pgoff_t start, pgoff_t end);
 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
 void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
+int filemap_invalidate_pages(struct address_space *mapping,
+			     loff_t pos, loff_t end, bool nowait);
 
 int write_inode_now(struct inode *, int sync);
 int filemap_fdatawrite(struct address_space *);
-- 
cgit v1.2.3


From 45b8fc3096542a53bfd245a9ad8ef870384b4897 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 29 Aug 2024 10:42:27 -0700
Subject: lib/buildid: rename build_id_parse() into build_id_parse_nofault()

Make it clear that build_id_parse() assumes that it can take no page
fault by renaming it and current few users to build_id_parse_nofault().

Also add build_id_parse() stub which for now falls back to non-sleepable
implementation, but will be changed in subsequent patches to take
advantage of sleepable context. PROCMAP_QUERY ioctl() on
/proc/<pid>/maps file is using build_id_parse() and will automatically
take advantage of more reliable sleepable context implementation.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240829174232.3133883-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/buildid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 20aa3c2d89f7..014a88c41073 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -7,8 +7,8 @@
 #define BUILD_ID_SIZE_MAX 20
 
 struct vm_area_struct;
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
-		   __u32 *size);
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
 
 #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
-- 
cgit v1.2.3


From d4dd9775ec242425576af93daadb80a34083a53c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 29 Aug 2024 10:42:31 -0700
Subject: bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack()
 helpers

Add sleepable implementations of bpf_get_stack() and
bpf_get_task_stack() helpers and allow them to be used from sleepable
BPF program (e.g., sleepable uprobes).

Note, the stack trace IPs capturing itself is not sleepable (that would
need to be a separate project), only build ID fetching is sleepable and
thus more reliable, as it will wait for data to be paged in, if
necessary. For that we make use of sleepable build_id_parse()
implementation.

Now that build ID related internals in kernel/bpf/stackmap.c can be used
both in sleepable and non-sleepable contexts, we need to add additional
rcu_read_lock()/rcu_read_unlock() protection around fetching
perf_callchain_entry, but with the refactoring in previous commit it's
now pretty straightforward. We make sure to do rcu_read_unlock (in
sleepable mode only) right before stack_map_get_build_id_offset() call
which can sleep. By that time we don't have any more use of
perf_callchain_entry.

Note, bpf_get_task_stack() will fail for user mode if task != current.
And for kernel mode build ID are irrelevant. So in that sense adding
sleepable bpf_get_task_stack() implementation is a no-op. It feel right
to wire this up for symmetry and completeness, but I'm open to just
dropping it until we support `user && crosstask` condition.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240829174232.3133883-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6f87fb014fba..7f3e8477d5e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
 extern const struct bpf_func_proto bpf_get_stack_proto_pe;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
-- 
cgit v1.2.3


From 6513eb3d3191574b58859ef2d6dc26c0277c6f81 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 10 Sep 2024 17:35:35 -0400
Subject: net: tighten bad gso csum offset check in virtio_net_hdr

The referenced commit drops bad input, but has false positives.
Tighten the check to avoid these.

The check detects illegal checksum offload requests, which produce
csum_start/csum_off beyond end of packet after segmentation.

But it is based on two incorrect assumptions:

1. virtio_net_hdr_to_skb with VIRTIO_NET_HDR_GSO_TCP[46] implies GSO.
True in callers that inject into the tx path, such as tap.
But false in callers that inject into rx, like virtio-net.
Here, the flags indicate GRO, and CHECKSUM_UNNECESSARY or
CHECKSUM_NONE without VIRTIO_NET_HDR_F_NEEDS_CSUM is normal.

2. TSO requires checksum offload, i.e., ip_summed == CHECKSUM_PARTIAL.
False, as tcp[46]_gso_segment will fix up csum_start and offset for
all other ip_summed by calling __tcp_v4_send_check.

Because of 2, we can limit the scope of the fix to virtio_net_hdr
that do try to set these fields, with a bogus value.

Link: https://lore.kernel.org/netdev/20240909094527.GA3048202@port70.net/
Fixes: 89add40066f9 ("net: drop bad gso csum_start and offset in virtio_net_hdr")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20240910213553.839926-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/virtio_net.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 6c395a2600e8..276ca543ef44 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -173,7 +173,8 @@ retry:
 			break;
 		case SKB_GSO_TCPV4:
 		case SKB_GSO_TCPV6:
-			if (skb->csum_offset != offsetof(struct tcphdr, check))
+			if (skb->ip_summed == CHECKSUM_PARTIAL &&
+			    skb->csum_offset != offsetof(struct tcphdr, check))
 				return -EINVAL;
 			break;
 		}
-- 
cgit v1.2.3


From 170aafe35cb98e0f3fbacb446ea86389fbce22ea Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Tue, 10 Sep 2024 17:14:47 +0000
Subject: netdev: support binding dma-buf to netdevice

Add a netdev_dmabuf_binding struct which represents the
dma-buf-to-netdevice binding. The netlink API will bind the dma-buf to
rx queues on the netdevice. On the binding, the dma_buf_attach
& dma_buf_map_attachment will occur. The entries in the sg_table from
mapping will be inserted into a genpool to make it ready
for allocation.

The chunks in the genpool are owned by a dmabuf_chunk_owner struct which
holds the dma-buf offset of the base of the chunk and the dma_addr of
the chunk. Both are needed to use allocations that come from this chunk.

We create a new type that represents an allocation from the genpool:
net_iov. We setup the net_iov allocation size in the
genpool to PAGE_SIZE for simplicity: to match the PAGE_SIZE normally
allocated by the page pool and given to the drivers.

The user can unbind the dmabuf from the netdevice by closing the netlink
socket that established the binding. We do this so that the binding is
automatically unbound even if the userspace process crashes.

The binding and unbinding leaves an indicator in struct netdev_rx_queue
that the given queue is bound, and the binding is actuated by resetting
the rx queue using the queue API.

The netdev_dmabuf_binding struct is refcounted, and releases its
resources only when all the refs are released.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com> # excluding netlink
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-4-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2465bdb6037f..e87b5e488325 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3953,6 +3953,8 @@ u8 dev_xdp_prog_count(struct net_device *dev);
 int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
 
+u32 dev_get_min_mp_channel_count(const struct net_device *dev);
+
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
-- 
cgit v1.2.3


From 9f6b619edf2e85746f261b42ae8f818a59d126f7 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Tue, 10 Sep 2024 17:14:51 +0000
Subject: net: support non paged skb frags

Make skb_frag_page() fail in the case where the frag is not backed
by a page, and fix its relevant callers to handle this case.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-8-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h     | 42 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/skbuff_ref.h |  9 ++++-----
 2 files changed, 45 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5803eb8a157d..1019f14583c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3524,21 +3524,58 @@ static inline void skb_frag_off_copy(skb_frag_t *fragto,
 	fragto->offset = fragfrom->offset;
 }
 
+/* Return: true if the skb_frag contains a net_iov. */
+static inline bool skb_frag_is_net_iov(const skb_frag_t *frag)
+{
+	return netmem_is_net_iov(frag->netmem);
+}
+
+/**
+ * skb_frag_net_iov - retrieve the net_iov referred to by fragment
+ * @frag: the fragment
+ *
+ * Return: the &struct net_iov associated with @frag. Returns NULL if this
+ * frag has no associated net_iov.
+ */
+static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag)
+{
+	if (!skb_frag_is_net_iov(frag))
+		return NULL;
+
+	return netmem_to_net_iov(frag->netmem);
+}
+
 /**
  * skb_frag_page - retrieve the page referred to by a paged fragment
  * @frag: the paged fragment
  *
- * Returns the &struct page associated with @frag.
+ * Return: the &struct page associated with @frag. Returns NULL if this frag
+ * has no associated page.
  */
 static inline struct page *skb_frag_page(const skb_frag_t *frag)
 {
+	if (skb_frag_is_net_iov(frag))
+		return NULL;
+
 	return netmem_to_page(frag->netmem);
 }
 
+/**
+ * skb_frag_netmem - retrieve the netmem referred to by a fragment
+ * @frag: the fragment
+ *
+ * Return: the &netmem_ref associated with @frag.
+ */
+static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag)
+{
+	return frag->netmem;
+}
+
 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
+
 /**
  * skb_frag_address - gets the address of the data contained in a paged fragment
  * @frag: the paged fragment buffer
@@ -3548,6 +3585,9 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
  */
 static inline void *skb_frag_address(const skb_frag_t *frag)
 {
+	if (!skb_frag_page(frag))
+		return NULL;
+
 	return page_address(skb_frag_page(frag)) + skb_frag_off(frag);
 }
 
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 16c241a23472..0f3c58007488 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -34,14 +34,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 
 bool napi_pp_put_page(netmem_ref netmem);
 
-static inline void
-skb_page_unref(struct page *page, bool recycle)
+static inline void skb_page_unref(netmem_ref netmem, bool recycle)
 {
 #ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page_to_netmem(page)))
+	if (recycle && napi_pp_put_page(netmem))
 		return;
 #endif
-	put_page(page);
+	put_page(netmem_to_page(netmem));
 }
 
 /**
@@ -54,7 +53,7 @@ skb_page_unref(struct page *page, bool recycle)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	skb_page_unref(skb_frag_page(frag), recycle);
+	skb_page_unref(skb_frag_netmem(frag), recycle);
 }
 
 /**
-- 
cgit v1.2.3


From 65249feb6b3df9e17bab5911ee56fa7b0971e231 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Tue, 10 Sep 2024 17:14:52 +0000
Subject: net: add support for skbs with unreadable frags

For device memory TCP, we expect the skb headers to be available in host
memory for access, and we expect the skb frags to be in device memory
and unaccessible to the host. We expect there to be no mixing and
matching of device memory frags (unaccessible) with host memory frags
(accessible) in the same skb.

Add a skb->devmem flag which indicates whether the frags in this skb
are device memory frags or not.

__skb_fill_netmem_desc() now checks frags added to skbs for net_iov,
and marks the skb as skb->devmem accordingly.

Add checks through the network stack to avoid accessing the frags of
devmem skbs and avoid coalescing devmem skbs with non devmem skbs.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-9-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1019f14583c9..39f1d16f3628 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -827,6 +827,8 @@ enum skb_tstamp_type {
  *	@csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
+ *	@unreadable: indicates that at least 1 of the fragments in this skb is
+ *		unreadable.
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required
@@ -1008,7 +1010,7 @@ struct sk_buff {
 #if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
 #endif
-
+	__u8			unreadable:1;
 #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
 	__u16			tc_index;	/* traffic control index */
 #endif
@@ -1824,6 +1826,12 @@ static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
 		__skb_zcopy_downgrade_managed(skb);
 }
 
+/* Return true if frags in this skb are readable by the host. */
+static inline bool skb_frags_readable(const struct sk_buff *skb)
+{
+	return !skb->unreadable;
+}
+
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;
@@ -2540,10 +2548,17 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
 static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
 					  netmem_ref netmem, int off, int size)
 {
-	struct page *page = netmem_to_page(netmem);
+	struct page *page;
 
 	__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);
 
+	if (netmem_is_net_iov(netmem)) {
+		skb->unreadable = true;
+		return;
+	}
+
+	page = netmem_to_page(netmem);
+
 	/* Propagate page pfmemalloc to the skb if we can. The problem is
 	 * that not all callers have unique ownership of the page but rely
 	 * on page_is_pfmemalloc doing the right thing(tm).
-- 
cgit v1.2.3


From 8f0b3cc9a4c102c24808c87f1bc943659d7a7f9f Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Tue, 10 Sep 2024 17:14:53 +0000
Subject: tcp: RX path for devmem TCP

In tcp_recvmsg_locked(), detect if the skb being received by the user
is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM
flag - pass it to tcp_recvmsg_devmem() for custom handling.

tcp_recvmsg_devmem() copies any data in the skb header to the linear
buffer, and returns a cmsg to the user indicating the number of bytes
returned in the linear buffer.

tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags,
and returns to the user a cmsg_devmem indicating the location of the
data in the dmabuf device memory. cmsg_devmem contains this information:

1. the offset into the dmabuf where the payload starts. 'frag_offset'.
2. the size of the frag. 'frag_size'.
3. an opaque token 'frag_token' to return to the kernel when the buffer
is to be released.

The pages awaiting freeing are stored in the newly added
sk->sk_user_frags, and each page passed to userspace is get_page()'d.
This reference is dropped once the userspace indicates that it is
done reading this page.  All pages are released when the socket is
destroyed.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20240910171458.219195-10-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/socket.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index df9cdb8bbfb8..d18cc47e89bd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -327,6 +327,7 @@ struct ucred {
 					  * plain text and require encryption
 					  */
 
+#define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
-- 
cgit v1.2.3


From aa58bec064ab16224645776ce8e0af2fee46136a Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Date: Mon, 9 Sep 2024 13:55:02 +0530
Subject: net: ethernet: oa_tc6: implement register write operation

Implement register write operation according to the control communication
specified in the OPEN Alliance 10BASE-T1x MACPHY Serial Interface
document. Control write commands are used by the SPI host to write
registers within the MAC-PHY. Each control write commands are composed of
a 32 bits control command header followed by register write data.

The MAC-PHY ignores the final 32 bits of data from the SPI host at the
end of the control write command. The write command and data is also
echoed from the MAC-PHY back to the SPI host to enable the SPI host to
identify which register write failed in the case of any bus errors.
Control write commands can write either a single register or multiple
consecutive registers. When multiple consecutive registers are written,
the address is automatically post-incremented by the MAC-PHY. Writing to
any unimplemented or undefined registers shall be ignored and yield no
effect.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Link: https://patch.msgid.link/20240909082514.262942-3-Parthiban.Veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/oa_tc6.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 include/linux/oa_tc6.h

(limited to 'include/linux')

diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h
new file mode 100644
index 000000000000..99c490f1c8a8
--- /dev/null
+++ b/include/linux/oa_tc6.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * OPEN Alliance 10BASE‑T1x MAC‑PHY Serial Interface framework
+ *
+ * Link: https://opensig.org/download/document/OPEN_Alliance_10BASET1x_MAC-PHY_Serial_Interface_V1.1.pdf
+ *
+ * Author: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
+ */
+
+#include <linux/spi/spi.h>
+
+struct oa_tc6;
+
+struct oa_tc6 *oa_tc6_init(struct spi_device *spi);
+int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value);
+int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
+			   u8 length);
-- 
cgit v1.2.3


From 375d1e0278cca70ce801d52c6b95c7a3c00f249c Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Date: Mon, 9 Sep 2024 13:55:03 +0530
Subject: net: ethernet: oa_tc6: implement register read operation

Implement register read operation according to the control communication
specified in the OPEN Alliance 10BASE-T1x MACPHY Serial Interface
document. Control read commands are used by the SPI host to read
registers within the MAC-PHY. Each control read commands are composed of
a 32 bits control command header.

The MAC-PHY ignores all data from the SPI host following the control
header for the remainder of the control read command. Control read
commands can read either a single register or multiple consecutive
registers. When multiple consecutive registers are read, the address is
automatically post-incremented by the MAC-PHY. Reading any unimplemented
or undefined registers shall return zero.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Link: https://patch.msgid.link/20240909082514.262942-4-Parthiban.Veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/oa_tc6.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h
index 99c490f1c8a8..85aeecf87306 100644
--- a/include/linux/oa_tc6.h
+++ b/include/linux/oa_tc6.h
@@ -15,3 +15,6 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi);
 int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value);
 int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
 			   u8 length);
+int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value);
+int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
+			  u8 length);
-- 
cgit v1.2.3


From 8f9bf857e43b3f75a098e3af3a6fec2d03203a1e Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Date: Mon, 9 Sep 2024 13:55:06 +0530
Subject: net: ethernet: oa_tc6: implement internal PHY initialization

Internal PHY is initialized as per the PHY register capability supported
by the MAC-PHY. Direct PHY Register Access Capability indicates if PHY
registers are directly accessible within the SPI register memory space.
Indirect PHY Register Access Capability indicates if PHY registers are
indirectly accessible through the MDIO/MDC registers MDIOACCn defined in
OPEN Alliance specification. Currently the direct register access is only
supported.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Link: https://patch.msgid.link/20240909082514.262942-7-Parthiban.Veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/oa_tc6.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h
index 85aeecf87306..606ba9f1e663 100644
--- a/include/linux/oa_tc6.h
+++ b/include/linux/oa_tc6.h
@@ -7,11 +7,13 @@
  * Author: Parthiban Veerasooran <parthiban.veerasooran@microchip.com>
  */
 
+#include <linux/etherdevice.h>
 #include <linux/spi/spi.h>
 
 struct oa_tc6;
 
-struct oa_tc6 *oa_tc6_init(struct spi_device *spi);
+struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev);
+void oa_tc6_exit(struct oa_tc6 *tc6);
 int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value);
 int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
 			   u8 length);
-- 
cgit v1.2.3


From 53fbde8ab21e8c2c6187159cc17fc10cbf20900a Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Date: Mon, 9 Sep 2024 13:55:09 +0530
Subject: net: ethernet: oa_tc6: implement transmit path to transfer tx
 ethernet frames

The transmit ethernet frame will be converted into multiple transmit data
chunks. Each transmit data chunk consists of a 4 bytes header followed by
a 64 bytes transmit data chunk payload. The 4 bytes data header occurs at
the beginning of each transmit data chunk on MOSI. The data header
contains the information needed to determine the validity and location of
the transmit frame data within the data chunk payload. The number of
transmit data chunks transmitted to mac-phy is limited to the number
transmit credits available in the mac-phy. Initially the transmit credits
will be updated from the buffer status register and then it will be
updated from the footer received on each spi data transfer. The received
footer will be examined for the transmit errors if any.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Link: https://patch.msgid.link/20240909082514.262942-10-Parthiban.Veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/oa_tc6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h
index 606ba9f1e663..5c7811ac9cbe 100644
--- a/include/linux/oa_tc6.h
+++ b/include/linux/oa_tc6.h
@@ -20,3 +20,4 @@ int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
 int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value);
 int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
 			  u8 length);
+netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb);
-- 
cgit v1.2.3


From afd42170c8a6e68edd3f3a7f2aacd2bfbedb58b2 Mon Sep 17 00:00:00 2001
From: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Date: Mon, 9 Sep 2024 13:55:12 +0530
Subject: net: ethernet: oa_tc6: add helper function to enable zero align rx
 frame

Zero align receive frame feature can be enabled to align all receive
ethernet frames data to start at the beginning of any receive data chunk
payload with a start word offset (SWO) of zero. Receive frames may begin
anywhere within the receive data chunk payload when this feature is not
enabled.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Parthiban Veerasooran <Parthiban.Veerasooran@microchip.com>
Link: https://patch.msgid.link/20240909082514.262942-13-Parthiban.Veerasooran@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/oa_tc6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h
index 5c7811ac9cbe..15f58e3c56c7 100644
--- a/include/linux/oa_tc6.h
+++ b/include/linux/oa_tc6.h
@@ -21,3 +21,4 @@ int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value);
 int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[],
 			  u8 length);
 netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb);
+int oa_tc6_zero_align_receive_frame_enable(struct oa_tc6 *tc6);
-- 
cgit v1.2.3


From 5e9b50dea970ae6d3e1309d4254157099734a2af Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 30 Aug 2024 15:04:59 +0200
Subject: fs: add f_pipe

Only regular files with FMODE_ATOMIC_POS and directories need
f_pos_lock. Place a new f_pipe member in a union with f_pos_lock
that they can use and make them stop abusing f_version in follow-up
patches.

Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-18-6d3e4816aa7b@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e6b3c1afb31..ca4925008244 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1001,6 +1001,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_cred: stashed credentials of creator/opener
  * @f_path: path of the file
  * @f_pos_lock: lock protecting file position
+ * @f_pipe: specific to pipes
  * @f_pos: file position
  * @f_version: file version
  * @f_security: LSM security context of this file
@@ -1026,7 +1027,12 @@ struct file {
 	const struct cred		*f_cred;
 	/* --- cacheline 1 boundary (64 bytes) --- */
 	struct path			f_path;
-	struct mutex			f_pos_lock;
+	union {
+		/* regular files (with FMODE_ATOMIC_POS) and directories */
+		struct mutex		f_pos_lock;
+		/* pipes */
+		u64			f_pipe;
+	};
 	loff_t				f_pos;
 	u64				f_version;
 	/* --- cacheline 2 boundary (128 bytes) --- */
-- 
cgit v1.2.3


From 11068e0b64cbb540b96e577fcca0926242ecaf58 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 30 Aug 2024 15:05:01 +0200
Subject: fs: remove f_version

Now that detecting concurrent seeks is done by the filesystems that
require it we can remove f_version and free up 8 bytes for future
extensions.

Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-20-6d3e4816aa7b@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ca4925008244..7e11ce172140 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1003,7 +1003,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_pos_lock: lock protecting file position
  * @f_pipe: specific to pipes
  * @f_pos: file position
- * @f_version: file version
  * @f_security: LSM security context of this file
  * @f_owner: file owner
  * @f_wb_err: writeback error
@@ -1034,11 +1033,10 @@ struct file {
 		u64			f_pipe;
 	};
 	loff_t				f_pos;
-	u64				f_version;
-	/* --- cacheline 2 boundary (128 bytes) --- */
 #ifdef CONFIG_SECURITY
 	void				*f_security;
 #endif
+	/* --- cacheline 2 boundary (128 bytes) --- */
 	struct fown_struct		*f_owner;
 	errseq_t			f_wb_err;
 	errseq_t			f_sb_err;
-- 
cgit v1.2.3


From 2077006d4725c82c6e9612cec3a6c140921b067f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 10 Sep 2024 10:16:39 +0200
Subject: uidgid: make sure we fit into one cacheline

When I expanded uidgid mappings I intended for a struct uid_gid_map to
fit into a single cacheline on x86 as they tend to be pretty
performance sensitive (idmapped mounts etc). But a 4 byte hole was added
that brought it over 64 bytes. Fix that by adding the static extent
array and the extent counter into a substruct. C's type punning for
unions guarantees that we can access ->nr_extents even if the last
written to member wasn't within the same object. This is also what we
rely on in struct_group() and friends. This of course relies on
non-strict aliasing which we don't do.

99) If the member used to read the contents of a union object is not the
    same as the member last used to store a value in the object, the
    appropriate part of the object representation of the value is
    reinterpreted as an object representation in the new type as
    described in 6.2.6 (a process sometimes called "type punning").

Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2310.pdf
Link: https://lore.kernel.org/r/20240910-work-uid_gid_map-v1-1-e6bc761363ed@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/user_namespace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..3625096d5f85 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -21,9 +21,11 @@ struct uid_gid_extent {
 };
 
 struct uid_gid_map { /* 64 bytes -- 1 cache line */
-	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+		struct {
+			struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+			u32 nr_extents;
+		};
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
-- 
cgit v1.2.3


From db0aa2e9566fda2d23dc8f6c102856ead95578a4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 19 Jun 2024 00:20:42 +0100
Subject: mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of
 folios

Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.

The folio_queue structure looks like:

	struct folio_queue {
		struct folio_batch	vec;
		u8			orders[PAGEVEC_SIZE];
		struct folio_queue	*next;
		struct folio_queue	*prev;
		unsigned long		marks;
		unsigned long		marks2;
	};

It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.

A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions.  Two mark bits are
available per slot.  The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.

The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.

With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock.  Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.

[Questions/thoughts]

 (1) To manage this, I need a head pointer, a tail pointer, a tail slot
     number (assuming insertion happens at the tail end and the next
     pointers point from head to tail).  Should I put these into a struct
     of their own, say "folio_queue_head" or "rolling_buffer"?

     I will end up with two of these in netfs_io_request eventually, one
     keeping track of the pagecache I'm dealing with for buffered I/O and
     the other to hold a bounce buffer when we need one.

 (2) Should I make the slots {folio,off,len} or bio_vec?

 (3) This is intended to replace ITER_XARRAY eventually.  Using an xarray
     in I/O iteration requires the taking of the RCU read lock, doing
     copying under the RCU read lock, walking the xarray (which may change
     under us), handling retries and dealing with special values.

     The advantage of ITER_XARRAY is that when we're dealing with the
     pagecache directly, we don't need any allocation - but if we're doing
     encrypted comms, there's a good chance we'd be using a bounce buffer
     anyway.

     This will require afs, erofs, cifs, orangefs and fscache to be
     converted to not use this.  afs still uses it for dirs and symlinks;
     some of erofs usages should be easy to change, but there's one which
     won't be so easy; ceph's use via fscache can be fixed by porting ceph
     to netfslib; cifs is using xarray as a bounce buffer - that can be
     moved to use sheaves instead; and orangefs has a similar problem to
     erofs - maybe orangefs could use netfslib?

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/folio_queue.h | 138 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iov_iter.h    |  57 ++++++++++++++++++
 include/linux/uio.h         |  12 ++++
 3 files changed, 207 insertions(+)
 create mode 100644 include/linux/folio_queue.h

(limited to 'include/linux')

diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
new file mode 100644
index 000000000000..52773613bf23
--- /dev/null
+++ b/include/linux/folio_queue.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Queue of folios definitions
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_FOLIO_QUEUE_H
+#define _LINUX_FOLIO_QUEUE_H
+
+#include <linux/pagevec.h>
+
+/*
+ * Segment in a queue of running buffers.  Each segment can hold a number of
+ * folios and a portion of the queue can be referenced with the ITER_FOLIOQ
+ * iterator.  The possibility exists of inserting non-folio elements into the
+ * queue (such as gaps).
+ *
+ * Explicit prev and next pointers are used instead of a list_head to make it
+ * easier to add segments to tail and remove them from the head without the
+ * need for a lock.
+ */
+struct folio_queue {
+	struct folio_batch	vec;		/* Folios in the queue segment */
+	u8			orders[PAGEVEC_SIZE]; /* Order of each folio */
+	struct folio_queue	*next;		/* Next queue segment or NULL */
+	struct folio_queue	*prev;		/* Previous queue segment of NULL */
+	unsigned long		marks;		/* 1-bit mark per folio */
+	unsigned long		marks2;		/* Second 1-bit mark per folio */
+#if PAGEVEC_SIZE > BITS_PER_LONG
+#error marks is not big enough
+#endif
+};
+
+static inline void folioq_init(struct folio_queue *folioq)
+{
+	folio_batch_init(&folioq->vec);
+	folioq->next = NULL;
+	folioq->prev = NULL;
+	folioq->marks = 0;
+	folioq->marks2 = 0;
+}
+
+static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
+{
+	return PAGEVEC_SIZE;
+}
+
+static inline unsigned int folioq_count(struct folio_queue *folioq)
+{
+	return folio_batch_count(&folioq->vec);
+}
+
+static inline bool folioq_full(struct folio_queue *folioq)
+{
+	//return !folio_batch_space(&folioq->vec);
+	return folioq_count(folioq) >= folioq_nr_slots(folioq);
+}
+
+static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
+{
+	return test_bit(slot, &folioq->marks);
+}
+
+static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
+{
+	set_bit(slot, &folioq->marks);
+}
+
+static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
+{
+	clear_bit(slot, &folioq->marks);
+}
+
+static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
+{
+	return test_bit(slot, &folioq->marks2);
+}
+
+static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
+{
+	set_bit(slot, &folioq->marks2);
+}
+
+static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
+{
+	clear_bit(slot, &folioq->marks2);
+}
+
+static inline unsigned int __folio_order(struct folio *folio)
+{
+	if (!folio_test_large(folio))
+		return 0;
+	return folio->_flags_1 & 0xff;
+}
+
+static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
+{
+	unsigned int slot = folioq->vec.nr++;
+
+	folioq->vec.folios[slot] = folio;
+	folioq->orders[slot] = __folio_order(folio);
+	return slot;
+}
+
+static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
+{
+	unsigned int slot = folioq->vec.nr++;
+
+	folioq->vec.folios[slot] = folio;
+	folioq->orders[slot] = __folio_order(folio);
+	folioq_mark(folioq, slot);
+	return slot;
+}
+
+static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
+{
+	return folioq->vec.folios[slot];
+}
+
+static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
+{
+	return folioq->orders[slot];
+}
+
+static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
+{
+	return PAGE_SIZE << folioq_folio_order(folioq, slot);
+}
+
+static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
+{
+	folioq->vec.folios[slot] = NULL;
+	folioq_unmark(folioq, slot);
+	folioq_unmark2(folioq, slot);
+}
+
+#endif /* _LINUX_FOLIO_QUEUE_H */
diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index 270454a6703d..a223370a59a7 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -10,6 +10,7 @@
 
 #include <linux/uio.h>
 #include <linux/bvec.h>
+#include <linux/folio_queue.h>
 
 typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
 			     void *priv, void *priv2);
@@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
 	return progress;
 }
 
+/*
+ * Handle ITER_FOLIOQ.
+ */
+static __always_inline
+size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+		      iov_step_f step)
+{
+	const struct folio_queue *folioq = iter->folioq;
+	unsigned int slot = iter->folioq_slot;
+	size_t progress = 0, skip = iter->iov_offset;
+
+	if (slot == folioq_nr_slots(folioq)) {
+		/* The iterator may have been extended. */
+		folioq = folioq->next;
+		slot = 0;
+	}
+
+	do {
+		struct folio *folio = folioq_folio(folioq, slot);
+		size_t part, remain, consumed;
+		size_t fsize;
+		void *base;
+
+		if (!folio)
+			break;
+
+		fsize = folioq_folio_size(folioq, slot);
+		base = kmap_local_folio(folio, skip);
+		part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
+		remain = step(base, progress, part, priv, priv2);
+		kunmap_local(base);
+		consumed = part - remain;
+		len -= consumed;
+		progress += consumed;
+		skip += consumed;
+		if (skip >= fsize) {
+			skip = 0;
+			slot++;
+			if (slot == folioq_nr_slots(folioq) && folioq->next) {
+				folioq = folioq->next;
+				slot = 0;
+			}
+		}
+		if (remain)
+			break;
+	} while (len);
+
+	iter->folioq_slot = slot;
+	iter->folioq = folioq;
+	iter->iov_offset = skip;
+	iter->count -= progress;
+	return progress;
+}
+
 /*
  * Handle ITER_XARRAY.
  */
@@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
 		return iterate_bvec(iter, len, priv, priv2, step);
 	if (iov_iter_is_kvec(iter))
 		return iterate_kvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_folioq(iter))
+		return iterate_folioq(iter, len, priv, priv2, step);
 	if (iov_iter_is_xarray(iter))
 		return iterate_xarray(iter, len, priv, priv2, step);
 	return iterate_discard(iter, len, priv, priv2, step);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 7020adedfa08..845d110acadc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -11,6 +11,7 @@
 #include <uapi/linux/uio.h>
 
 struct page;
+struct folio_queue;
 
 typedef unsigned int __bitwise iov_iter_extraction_t;
 
@@ -25,6 +26,7 @@ enum iter_type {
 	ITER_IOVEC,
 	ITER_BVEC,
 	ITER_KVEC,
+	ITER_FOLIOQ,
 	ITER_XARRAY,
 	ITER_DISCARD,
 };
@@ -66,6 +68,7 @@ struct iov_iter {
 				const struct iovec *__iov;
 				const struct kvec *kvec;
 				const struct bio_vec *bvec;
+				const struct folio_queue *folioq;
 				struct xarray *xarray;
 				void __user *ubuf;
 			};
@@ -74,6 +77,7 @@ struct iov_iter {
 	};
 	union {
 		unsigned long nr_segs;
+		u8 folioq_slot;
 		loff_t xarray_start;
 	};
 };
@@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_DISCARD;
 }
 
+static inline bool iov_iter_is_folioq(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_FOLIOQ;
+}
+
 static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_XARRAY;
@@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
 			unsigned long nr_segs, size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
+void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
+			  const struct folio_queue *folioq,
+			  unsigned int first_slot, unsigned int offset, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
-- 
cgit v1.2.3


From 197a3de607d92b3d72e69edf5470e0a8fae548cc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 14 Aug 2024 16:14:21 +0100
Subject: iov_iter: Provide copy_folio_from_iter()

Provide a copy_folio_from_iter() wrapper.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <christian@brauner.io>
cc: Matthew Wilcox <willy@infradead.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20240814203850.2240469-14-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/uio.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 845d110acadc..853f9de5aa05 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -189,6 +189,12 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
 	return copy_page_to_iter(&folio->page, offset, bytes, i);
 }
 
+static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
+					  size_t bytes, struct iov_iter *i)
+{
+	return copy_page_from_iter(&folio->page, offset, bytes, i);
+}
+
 static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
 		size_t offset, size_t bytes, struct iov_iter *i)
 {
-- 
cgit v1.2.3


From cd0277ed0c188dd40e7744e89299af7b78831ca4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 May 2024 21:47:07 +0100
Subject: netfs: Use new folio_queue data type and iterator instead of xarray
 iter

Make the netfs write-side routines use the new folio_queue struct to hold a
rolling buffer of folios, with the issuer adding folios at the tail and the
collector removing them from the head as they're processed instead of using
an xarray.

This will allow a subsequent patch to simplify the write collector.

The primary mark (as tested by folioq_is_marked()) is used to note if the
corresponding folio needs putting.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-16-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 63ab2c775a21..f7b444f4f251 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -38,10 +38,6 @@ static inline void folio_start_private_2(struct folio *folio)
 	folio_set_private_2(folio);
 }
 
-/* Marks used on xarray-based buffers */
-#define NETFS_BUF_PUT_MARK	XA_MARK_0	/* - Page needs putting  */
-#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1	/* - Page needs wb/dirty flag wrangling */
-
 enum netfs_io_source {
 	NETFS_SOURCE_UNKNOWN,
 	NETFS_FILL_WITH_ZEROES,
@@ -233,6 +229,8 @@ struct netfs_io_request {
 	struct netfs_io_stream	io_streams[2];	/* Streams of parallel I/O operations */
 #define NR_IO_STREAMS 2 //wreq->nr_io_streams
 	struct netfs_group	*group;		/* Writeback group being written back */
+	struct folio_queue	*buffer;	/* Head of I/O buffer */
+	struct folio_queue	*buffer_tail;	/* Tail of I/O buffer */
 	struct iov_iter		iter;		/* Unencrypted-side iterator */
 	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
@@ -254,6 +252,8 @@ struct netfs_io_request {
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
+	u8			buffer_head_slot; /* First slot in ->buffer */
+	u8			buffer_tail_slot; /* Next slot in ->buffer_tail */
 	unsigned long long	i_size;		/* Size of the file */
 	unsigned long long	start;		/* Start position */
 	atomic64_t		issued_to;	/* Write issuer folio cursor */
-- 
cgit v1.2.3


From 983cdcf8fe141b0ce16bc71959a5dc55bcb0764d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 6 Jun 2024 07:48:55 +0100
Subject: netfs: Simplify the writeback code

Use the new folio_queue structures to simplify the writeback code.  The
problem with referring to the i_pages xarray directly is that we may have
gaps in the sequence of folios we're writing from that we need to skip when
we're removing the writeback mark from the folios we're writing back from.

At the moment the code tries to deal with this by carefully tracking the
gaps in each writeback stream (eg. write to server and write to cache) and
divining when there's a gap that spans folios (something that's not helped
by folios not being a consistent size).

Instead, the folio_queue buffer contains pointers only the folios we're
dealing with, has them in ascending order and indicates a gap by placing
non-consequitive folios next to each other.  This makes it possible to
track where we need to clean up to by just keeping track of where we've
processed to on each stream and taking the minimum.

Note that the I/O iterator is always rounded up to the end of the folio,
even if that is beyond the EOF position, so that the cache can do DIO from
the page.  The excess space is cleared, though mmapped writes clobber it.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-18-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f7b444f4f251..bd0e3d147822 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -257,7 +257,6 @@ struct netfs_io_request {
 	unsigned long long	i_size;		/* Size of the file */
 	unsigned long long	start;		/* Start position */
 	atomic64_t		issued_to;	/* Write issuer folio cursor */
-	unsigned long long	contiguity;	/* Tracking for gaps in the writeback sequence */
 	unsigned long long	collected_to;	/* Point we've collected to */
 	unsigned long long	cleaned_to;	/* Position we've cleaned folios to */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
-- 
cgit v1.2.3


From ee4cdf7ba857a894ad1650d6ab77669cbbfa329e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 2 Jul 2024 00:40:22 +0100
Subject: netfs: Speed up buffered reading

Improve the efficiency of buffered reads in a number of ways:

 (1) Overhaul the algorithm in general so that it's a lot more compact and
     split the read submission code between buffered and unbuffered
     versions.  The unbuffered version can be vastly simplified.

 (2) Read-result collection is handed off to a work queue rather than being
     done in the I/O thread.  Multiple subrequests can be processes
     simultaneously.

 (3) When a subrequest is collected, any folios it fully spans are
     collected and "spare" data on either side is donated to either the
     previous or the next subrequest in the sequence.

Notes:

 (*) Readahead expansion is massively slows down fio, presumably because it
     causes a load of extra allocations, both folio and xarray, up front
     before RPC requests can be transmitted.

 (*) RDMA with cifs does appear to work, both with SIW and RXE.

 (*) PG_private_2-based reading and copy-to-cache is split out into its own
     file and altered to use folio_queue.  Note that the copy to the cache
     now creates a new write transaction against the cache and adds the
     folios to be copied into it.  This allows it to use part of the
     writeback I/O code.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/folio_queue.h | 18 ++++++++++++++++++
 include/linux/netfs.h       | 26 ++++++++++++++++++--------
 2 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
index 52773613bf23..955680c3bb5f 100644
--- a/include/linux/folio_queue.h
+++ b/include/linux/folio_queue.h
@@ -27,6 +27,7 @@ struct folio_queue {
 	struct folio_queue	*prev;		/* Previous queue segment of NULL */
 	unsigned long		marks;		/* 1-bit mark per folio */
 	unsigned long		marks2;		/* Second 1-bit mark per folio */
+	unsigned long		marks3;		/* Third 1-bit mark per folio */
 #if PAGEVEC_SIZE > BITS_PER_LONG
 #error marks is not big enough
 #endif
@@ -39,6 +40,7 @@ static inline void folioq_init(struct folio_queue *folioq)
 	folioq->prev = NULL;
 	folioq->marks = 0;
 	folioq->marks2 = 0;
+	folioq->marks3 = 0;
 }
 
 static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
@@ -87,6 +89,21 @@ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
 	clear_bit(slot, &folioq->marks2);
 }
 
+static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot)
+{
+	return test_bit(slot, &folioq->marks3);
+}
+
+static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot)
+{
+	set_bit(slot, &folioq->marks3);
+}
+
+static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot)
+{
+	clear_bit(slot, &folioq->marks3);
+}
+
 static inline unsigned int __folio_order(struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -133,6 +150,7 @@ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
 	folioq->vec.folios[slot] = NULL;
 	folioq_unmark(folioq, slot);
 	folioq_unmark2(folioq, slot);
+	folioq_unmark3(folioq, slot);
 }
 
 #endif /* _LINUX_FOLIO_QUEUE_H */
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index bd0e3d147822..c0f0c9c87d86 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -178,36 +178,43 @@ struct netfs_io_subrequest {
 	unsigned long long	start;		/* Where to start the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
+	size_t			consumed;	/* Amount of read data consumed */
+	size_t			prev_donated;	/* Amount of data donated from previous subreq */
+	size_t			next_donated;	/* Amount of data donated from next subreq */
 	refcount_t		ref;
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
 	unsigned int		nr_segs;	/* Number of segs in io_iter */
 	enum netfs_io_source	source;		/* Where to read from/write to */
 	unsigned char		stream_nr;	/* I/O stream this belongs to */
+	unsigned char		curr_folioq_slot; /* Folio currently being read */
+	unsigned char		curr_folio_order; /* Order of folio */
+	struct folio_queue	*curr_folioq;	/* Queue segment in which current folio resides */
 	unsigned long		flags;
 #define NETFS_SREQ_COPY_TO_CACHE	0	/* Set if should copy the data to the cache */
 #define NETFS_SREQ_CLEAR_TAIL		1	/* Set if the rest of the read should be cleared */
-#define NETFS_SREQ_SHORT_IO		2	/* Set if the I/O was short */
 #define NETFS_SREQ_SEEK_DATA_READ	3	/* Set if ->read() should SEEK_DATA first */
 #define NETFS_SREQ_NO_PROGRESS		4	/* Set if we didn't manage to read any data */
 #define NETFS_SREQ_ONDEMAND		5	/* Set if it's from on-demand read mode */
 #define NETFS_SREQ_BOUNDARY		6	/* Set if ends on hard boundary (eg. ceph object) */
+#define NETFS_SREQ_HIT_EOF		7	/* Set if short due to EOF */
 #define NETFS_SREQ_IN_PROGRESS		8	/* Unlocked when the subrequest completes */
 #define NETFS_SREQ_NEED_RETRY		9	/* Set if the filesystem requests a retry */
 #define NETFS_SREQ_RETRYING		10	/* Set if we're retrying */
 #define NETFS_SREQ_FAILED		11	/* Set if the subreq failed unretryably */
-#define NETFS_SREQ_HIT_EOF		12	/* Set if we hit the EOF */
 };
 
 enum netfs_io_origin {
 	NETFS_READAHEAD,		/* This read was triggered by readahead */
 	NETFS_READPAGE,			/* This read is a synchronous read */
+	NETFS_READ_GAPS,		/* This read is a synchronous read to fill gaps */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
 	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
+	NETFS_PGPRIV2_COPY_TO_CACHE,	/* [DEPRECATED] This is writing read data to the cache */
 	nr__netfs_io_origin
 } __mode(byte);
 
@@ -224,6 +231,7 @@ struct netfs_io_request {
 	struct address_space	*mapping;	/* The mapping being accessed */
 	struct kiocb		*iocb;		/* AIO completion vector */
 	struct netfs_cache_resources cache_resources;
+	struct readahead_control *ractl;	/* Readahead descriptor */
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
 	struct netfs_io_stream	io_streams[2];	/* Streams of parallel I/O operations */
@@ -244,12 +252,10 @@ struct netfs_io_request {
 	unsigned int		nr_group_rel;	/* Number of refs to release on ->group */
 	spinlock_t		lock;		/* Lock for queuing subreqs */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
-	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
-	size_t			upper_len;	/* Length can be extended to here */
 	unsigned long long	submitted;	/* Amount submitted for I/O so far */
 	unsigned long long	len;		/* Length of the request */
 	size_t			transferred;	/* Amount to be indicated as transferred */
-	short			error;		/* 0 or error that occurred */
+	long			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
 	u8			buffer_head_slot; /* First slot in ->buffer */
@@ -260,9 +266,9 @@ struct netfs_io_request {
 	unsigned long long	collected_to;	/* Point we've collected to */
 	unsigned long long	cleaned_to;	/* Position we've cleaned folios to */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
+	size_t			prev_donated;	/* Fallback for subreq->prev_donated */
 	refcount_t		ref;
 	unsigned long		flags;
-#define NETFS_RREQ_INCOMPLETE_IO	0	/* Some ioreqs terminated short or with error */
 #define NETFS_RREQ_COPY_TO_CACHE	1	/* Need to write to the cache */
 #define NETFS_RREQ_NO_UNLOCK_FOLIO	2	/* Don't unlock no_unlock_folio on completion */
 #define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
@@ -274,6 +280,7 @@ struct netfs_io_request {
 #define NETFS_RREQ_PAUSE		11	/* Pause subrequest generation */
 #define NETFS_RREQ_USE_IO_ITER		12	/* Use ->io_iter rather than ->i_pages */
 #define NETFS_RREQ_ALL_QUEUED		13	/* All subreqs are now queued */
+#define NETFS_RREQ_NEED_RETRY		14	/* Need to try retrying */
 #define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
@@ -292,7 +299,7 @@ struct netfs_request_ops {
 
 	/* Read request handling */
 	void (*expand_readahead)(struct netfs_io_request *rreq);
-	bool (*clamp_length)(struct netfs_io_subrequest *subreq);
+	int (*prepare_read)(struct netfs_io_subrequest *subreq);
 	void (*issue_read)(struct netfs_io_subrequest *subreq);
 	bool (*is_still_valid)(struct netfs_io_request *rreq);
 	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
@@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp);
 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
 
 /* (Sub)request management API. */
-void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool);
+void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
+				bool was_async);
+void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
+				  int error, bool was_async);
 void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
 			  enum netfs_sreq_ref_trace what);
 void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
-- 
cgit v1.2.3


From c4f1450ecccc5311db87f806998eda1c824c4e35 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 12 Jul 2024 12:44:30 +0100
Subject: cachefiles, netfs: Fix write to partial block at EOF

Because it uses DIO writes, cachefiles is unable to make a write to the
backing file if that write is not aligned to and sized according to the
backing file's DIO block alignment.  This makes it tricky to handle a write
to the cache where the EOF on the network file is not correctly aligned.

To get around this, netfslib attempts to tell the driver it is calling how
much more data there is available beyond the EOF that it can use to pad the
write (netfslib preclears the part of the folio above the EOF).  However,
it tries to tell the cache what the maximum length is, but doesn't
calculate this correctly; and, in any case, cachefiles actually ignores the
value and just skips the block.

Fix this by:

 (1) Change the value passed to indicate the amount of extra data that can
     be added to the operation (now ->submit_extendable_to).  This is much
     simpler to calculate as it's just the end of the folio minus the top
     of the data within the folio - rather than having to account for data
     spread over multiple folios.

 (2) Make cachefiles add some of this data if the subrequest it is given
     ends at the network file's i_size if the extra data is sufficient to
     pad out to a whole block.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-22-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index c0f0c9c87d86..5eaceef41e6c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -135,7 +135,7 @@ struct netfs_io_stream {
 	unsigned int		sreq_max_segs;	/* 0 or max number of segments in an iterator */
 	unsigned int		submit_off;	/* Folio offset we're submitting from */
 	unsigned int		submit_len;	/* Amount of data left to submit */
-	unsigned int		submit_max_len;	/* Amount I/O can be rounded up to */
+	unsigned int		submit_extendable_to; /* Amount I/O can be rounded up to */
 	void (*prepare_write)(struct netfs_io_subrequest *subreq);
 	void (*issue_write)(struct netfs_io_subrequest *subreq);
 	/* Collection tracking */
-- 
cgit v1.2.3


From 2982c8c19bab020e38da9d503aa21a3b389c53ac Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 26 Jul 2024 20:03:07 +0100
Subject: cifs: Use iterate_and_advance*() routines directly for hashing

Replace the bespoke cifs iterators of ITER_BVEC and ITER_KVEC to do hashing
with iterate_and_advance_kernel() - a variant on iterate_and_advance() that
only supports kernel-internal ITER_* types and not UBUF/IOVEC types.

The bespoke ITER_XARRAY is left because we don't really want to be calling
crypto_shash_update() under the RCU read lock for large amounts of data;
besides, ITER_XARRAY is going to be phased out.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.com>
cc: Tom Talpey <tom@talpey.com>
cc: Enzo Matsumiya <ematsumiya@suse.de>
cc: linux-cifs@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-24-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/iov_iter.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h
index a223370a59a7..c4aa58032faf 100644
--- a/include/linux/iov_iter.h
+++ b/include/linux/iov_iter.h
@@ -328,4 +328,51 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
 	return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
 }
 
+/**
+ * iterate_and_advance_kernel - Iterate over a kernel-internal iterator
+ * @iter: The iterator to iterate over.
+ * @len: The amount to iterate over.
+ * @priv: Data for the step functions.
+ * @priv2: More data for the step functions.
+ * @step: Function for other iterators; given kernel addresses.
+ *
+ * Iterate over the next part of an iterator, up to the specified length.  The
+ * buffer is presented in segments, which for kernel iteration are broken up by
+ * physical pages and mapped, with the mapped address being presented.
+ *
+ * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type
+ * iterators; it will not handle UBUF or IOVEC-type iterators.
+ *
+ * A step functions, @step, must be provided, one for handling mapped kernel
+ * addresses and the other is given user addresses which have the potential to
+ * fault since no pinning is performed.
+ *
+ * The step functions are passed the address and length of the segment, @priv,
+ * @priv2 and the amount of data so far iterated over (which can, for example,
+ * be added to @priv to point to the right part of a second buffer).  The step
+ * functions should return the amount of the segment they didn't process (ie. 0
+ * indicates complete processsing).
+ *
+ * This function returns the amount of data processed (ie. 0 means nothing was
+ * processed and the value of @len means processes to completion).
+ */
+static __always_inline
+size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
+				  void *priv2, iov_step_f step)
+{
+	if (unlikely(iter->count < len))
+		len = iter->count;
+	if (unlikely(!len))
+		return 0;
+	if (iov_iter_is_bvec(iter))
+		return iterate_bvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_kvec(iter))
+		return iterate_kvec(iter, len, priv, priv2, step);
+	if (iov_iter_is_folioq(iter))
+		return iterate_folioq(iter, len, priv, priv2, step);
+	if (iov_iter_is_xarray(iter))
+		return iterate_xarray(iter, len, priv, priv2, step);
+	return iterate_discard(iter, len, priv, priv2, step);
+}
+
 #endif /* _LINUX_IOV_ITER_H */
-- 
cgit v1.2.3


From c45b9a07b6392fa224ca76b89f24dae1046eef09 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Thu, 12 Sep 2024 22:30:34 +0900
Subject: Revert "firewire: core: use mutex to coordinate concurrent calls to
 flush completions"

This reverts commit d9605d67562505e27dcc0f71af418118d3db91e5, since this
commit is on the following reverted changes.

Link: https://lore.kernel.org/r/20240912133038.238786-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 19e8c5f9537c..f815d12deda0 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -512,7 +512,6 @@ union fw_iso_callback {
 struct fw_iso_context {
 	struct fw_card *card;
 	struct work_struct work;
-	struct mutex flushing_completions_mutex;
 	int type;
 	int channel;
 	int speed;
-- 
cgit v1.2.3


From 4010cb1efda08ec6fd02ec5db9da909322ef352e Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Thu, 12 Sep 2024 22:30:37 +0900
Subject: firewire: core: update documentation of kernel APIs for flushing
 completions

There is a slight difference between fw_iso_context_flush_completions() and
fw_iso_context_schedule_flush_completions().

This commit updates the documentations for them.

Link: https://lore.kernel.org/r/20240912133038.238786-5-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index f815d12deda0..b632eec3ab52 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -537,9 +537,11 @@ int fw_iso_context_flush_completions(struct fw_iso_context *ctx);
  * @ctx: the isochronous context
  *
  * Schedule a work item on workqueue to process the isochronous context. The registered callback
- * function is called in the worker if some packets have been already transferred since the last
- * time. If it is required to process the context in the current context,
- * fw_iso_context_flush_completions() is available instead.
+ * function is called by the worker when a queued packet buffer with the interrupt flag is
+ * completed, either after transmission in the IT context or after being filled in the IR context.
+ * The callback function is also called when the header buffer in the context becomes full, If it
+ * is required to process the context in the current context, fw_iso_context_flush_completions() is
+ * available instead.
  *
  * Context: Any context.
  */
-- 
cgit v1.2.3


From ede5bbe488d162bcd572880e58f9044c9df84050 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:10:27 +0300
Subject: ARM: ep93xx: add regmap aux_dev

The following driver's should be instantiated by ep93xx syscon driver:

- reboot
- pinctrl
- clock

They all require access to DEVCFG register with a shared lock held, to
avoid conflict writing to swlocked parts of DEVCFG.

Provide common resources such as base, regmap and spinlock via auxiliary
bus framework.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/soc/cirrus/ep93xx.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h
index 56fbe2dc59b1..a27447971302 100644
--- a/include/linux/soc/cirrus/ep93xx.h
+++ b/include/linux/soc/cirrus/ep93xx.h
@@ -3,6 +3,18 @@
 #define _SOC_EP93XX_H
 
 struct platform_device;
+struct regmap;
+struct spinlock_t;
+
+enum ep93xx_soc_model {
+	EP93XX_9301_SOC,
+	EP93XX_9307_SOC,
+	EP93XX_9312_SOC,
+};
+
+#include <linux/auxiliary_bus.h>
+#include <linux/compiler_types.h>
+#include <linux/container_of.h>
 
 #define EP93XX_CHIP_REV_D0	3
 #define EP93XX_CHIP_REV_D1	4
@@ -10,6 +22,20 @@ struct platform_device;
 #define EP93XX_CHIP_REV_E1	6
 #define EP93XX_CHIP_REV_E2	7
 
+struct ep93xx_regmap_adev {
+	struct auxiliary_device adev;
+	struct regmap *map;
+	void __iomem *base;
+	spinlock_t *lock;
+	void (*write)(struct regmap *map, spinlock_t *lock, unsigned int reg,
+		      unsigned int val);
+	void (*update_bits)(struct regmap *map, spinlock_t *lock,
+			    unsigned int reg, unsigned int mask, unsigned int val);
+};
+
+#define to_ep93xx_regmap_adev(_adev) \
+	container_of((_adev), struct ep93xx_regmap_adev, adev)
+
 #ifdef CONFIG_ARCH_EP93XX
 int ep93xx_pwm_acquire_gpio(struct platform_device *pdev);
 void ep93xx_pwm_release_gpio(struct platform_device *pdev);
-- 
cgit v1.2.3


From 2e7f55ce430240a5547b8a94b4c532fc8c20b18b Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:10:34 +0300
Subject: dmaengine: cirrus: Convert to DT for Cirrus EP93xx

Convert Cirrus EP93xx DMA to device tree usage:

- add OF ID match table with data
- add of_probe for device tree
- add xlate for m2m/m2p
- drop subsys_initcall code
- drop platform probe
- drop platform structs usage

>From now on it only supports device tree probing.

Co-developed-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Signed-off-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/platform_data/dma-ep93xx.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/dma-ep93xx.h b/include/linux/platform_data/dma-ep93xx.h
index eb9805bb3fe8..9ec5cdd5a1eb 100644
--- a/include/linux/platform_data/dma-ep93xx.h
+++ b/include/linux/platform_data/dma-ep93xx.h
@@ -3,8 +3,11 @@
 #define __ASM_ARCH_DMA_H
 
 #include <linux/types.h>
+#include <linux/device.h>
 #include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
+#include <linux/property.h>
+#include <linux/string.h>
 
 /*
  * M2P channels.
@@ -70,6 +73,9 @@ struct ep93xx_dma_platform_data {
 
 static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan)
 {
+	if (device_is_compatible(chan->device->dev, "cirrus,ep9301-dma-m2p"))
+		return true;
+
 	return !strcmp(dev_name(chan->device->dev), "ep93xx-dma-m2p");
 }
 
-- 
cgit v1.2.3


From b3ab5787e7acb02874ae86cbe13969d4dd01a585 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:10:48 +0300
Subject: input: keypad: ep93xx: add DT support for Cirrus EP93xx

- drop flags, they were not used anyway
- add OF ID match table
- process "autorepeat", "debounce-delay-ms", prescale from device tree
- drop platform data usage and it's header
- keymap goes from device tree now on

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/soc/cirrus/ep93xx.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h
index a27447971302..8942bfaf1545 100644
--- a/include/linux/soc/cirrus/ep93xx.h
+++ b/include/linux/soc/cirrus/ep93xx.h
@@ -41,8 +41,6 @@ int ep93xx_pwm_acquire_gpio(struct platform_device *pdev);
 void ep93xx_pwm_release_gpio(struct platform_device *pdev);
 int ep93xx_ide_acquire_gpio(struct platform_device *pdev);
 void ep93xx_ide_release_gpio(struct platform_device *pdev);
-int ep93xx_keypad_acquire_gpio(struct platform_device *pdev);
-void ep93xx_keypad_release_gpio(struct platform_device *pdev);
 int ep93xx_i2s_acquire(void);
 void ep93xx_i2s_release(void);
 unsigned int ep93xx_chip_revision(void);
@@ -52,8 +50,6 @@ static inline int ep93xx_pwm_acquire_gpio(struct platform_device *pdev) { return
 static inline void ep93xx_pwm_release_gpio(struct platform_device *pdev) {}
 static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; }
 static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {}
-static inline int ep93xx_keypad_acquire_gpio(struct platform_device *pdev) { return 0; }
-static inline void ep93xx_keypad_release_gpio(struct platform_device *pdev) {}
 static inline int ep93xx_i2s_acquire(void) { return 0; }
 static inline void ep93xx_i2s_release(void) {}
 static inline unsigned int ep93xx_chip_revision(void) { return 0; }
-- 
cgit v1.2.3


From a48ac3dc569771c18fcafbc8351d820cc343c54a Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:10:58 +0300
Subject: pwm: ep93xx: drop legacy pinctrl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop legacy gpio request/free since we are using
pinctrl for this now.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Thierry Reding <thierry.reding@gmail.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/soc/cirrus/ep93xx.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h
index 8942bfaf1545..f6376edc1b33 100644
--- a/include/linux/soc/cirrus/ep93xx.h
+++ b/include/linux/soc/cirrus/ep93xx.h
@@ -37,8 +37,6 @@ struct ep93xx_regmap_adev {
 	container_of((_adev), struct ep93xx_regmap_adev, adev)
 
 #ifdef CONFIG_ARCH_EP93XX
-int ep93xx_pwm_acquire_gpio(struct platform_device *pdev);
-void ep93xx_pwm_release_gpio(struct platform_device *pdev);
 int ep93xx_ide_acquire_gpio(struct platform_device *pdev);
 void ep93xx_ide_release_gpio(struct platform_device *pdev);
 int ep93xx_i2s_acquire(void);
@@ -46,8 +44,6 @@ void ep93xx_i2s_release(void);
 unsigned int ep93xx_chip_revision(void);
 
 #else
-static inline int ep93xx_pwm_acquire_gpio(struct platform_device *pdev) { return 0; }
-static inline void ep93xx_pwm_release_gpio(struct platform_device *pdev) {}
 static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; }
 static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {}
 static inline int ep93xx_i2s_acquire(void) { return 0; }
-- 
cgit v1.2.3


From a632229be268dde8f6d407638b5cfba8b78201d6 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:10:59 +0300
Subject: ata: pata_ep93xx: remove legacy pinctrl use

Drop legacy acquire/release since we are using pinctrl for this now.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Acked-by: Damien Le Moal <dlemoal@kernel.org>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/soc/cirrus/ep93xx.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h
index f6376edc1b33..142c33a2d7db 100644
--- a/include/linux/soc/cirrus/ep93xx.h
+++ b/include/linux/soc/cirrus/ep93xx.h
@@ -37,15 +37,11 @@ struct ep93xx_regmap_adev {
 	container_of((_adev), struct ep93xx_regmap_adev, adev)
 
 #ifdef CONFIG_ARCH_EP93XX
-int ep93xx_ide_acquire_gpio(struct platform_device *pdev);
-void ep93xx_ide_release_gpio(struct platform_device *pdev);
 int ep93xx_i2s_acquire(void);
 void ep93xx_i2s_release(void);
 unsigned int ep93xx_chip_revision(void);
 
 #else
-static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; }
-static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {}
 static inline int ep93xx_i2s_acquire(void) { return 0; }
 static inline void ep93xx_i2s_release(void) {}
 static inline unsigned int ep93xx_chip_revision(void) { return 0; }
-- 
cgit v1.2.3


From e5ef574dda702e62081d5c7991949cbdb7f65c08 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:11:00 +0300
Subject: ARM: ep93xx: delete all boardfiles

Delete the ep93xx board files.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/platform_data/spi-ep93xx.h | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 include/linux/platform_data/spi-ep93xx.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/spi-ep93xx.h b/include/linux/platform_data/spi-ep93xx.h
deleted file mode 100644
index b439f2a896e0..000000000000
--- a/include/linux/platform_data/spi-ep93xx.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_MACH_EP93XX_SPI_H
-#define __ASM_MACH_EP93XX_SPI_H
-
-struct spi_device;
-
-/**
- * struct ep93xx_spi_info - EP93xx specific SPI descriptor
- * @use_dma: use DMA for the transfers
- */
-struct ep93xx_spi_info {
-	bool	use_dma;
-};
-
-#endif /* __ASM_MACH_EP93XX_SPI_H */
-- 
cgit v1.2.3


From 43528a72526152f17a918973b3b8b6f383b90f98 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:11:01 +0300
Subject: ARM: ep93xx: soc: drop defines

Remove unnecessary defines, as we dropped board files.

Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/platform_data/eth-ep93xx.h    | 10 ---------
 include/linux/platform_data/keypad-ep93xx.h | 32 -----------------------------
 include/linux/soc/cirrus/ep93xx.h           | 13 ------------
 3 files changed, 55 deletions(-)
 delete mode 100644 include/linux/platform_data/eth-ep93xx.h
 delete mode 100644 include/linux/platform_data/keypad-ep93xx.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/eth-ep93xx.h b/include/linux/platform_data/eth-ep93xx.h
deleted file mode 100644
index 8eef637a804d..000000000000
--- a/include/linux/platform_data/eth-ep93xx.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PLATFORM_DATA_ETH_EP93XX
-#define _LINUX_PLATFORM_DATA_ETH_EP93XX
-
-struct ep93xx_eth_data {
-	unsigned char	dev_addr[6];
-	unsigned char	phy_id;
-};
-
-#endif
diff --git a/include/linux/platform_data/keypad-ep93xx.h b/include/linux/platform_data/keypad-ep93xx.h
deleted file mode 100644
index 3054fced8509..000000000000
--- a/include/linux/platform_data/keypad-ep93xx.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KEYPAD_EP93XX_H
-#define __KEYPAD_EP93XX_H
-
-struct matrix_keymap_data;
-
-/* flags for the ep93xx_keypad driver */
-#define EP93XX_KEYPAD_DISABLE_3_KEY	(1<<0)	/* disable 3-key reset */
-#define EP93XX_KEYPAD_DIAG_MODE		(1<<1)	/* diagnostic mode */
-#define EP93XX_KEYPAD_BACK_DRIVE	(1<<2)	/* back driving mode */
-#define EP93XX_KEYPAD_TEST_MODE		(1<<3)	/* scan only column 0 */
-#define EP93XX_KEYPAD_AUTOREPEAT	(1<<4)	/* enable key autorepeat */
-
-/**
- * struct ep93xx_keypad_platform_data - platform specific device structure
- * @keymap_data:	pointer to &matrix_keymap_data
- * @debounce:		debounce start count; terminal count is 0xff
- * @prescale:		row/column counter pre-scaler load value
- * @flags:		see above
- */
-struct ep93xx_keypad_platform_data {
-	struct matrix_keymap_data *keymap_data;
-	unsigned int	debounce;
-	unsigned int	prescale;
-	unsigned int	flags;
-	unsigned int	clk_rate;
-};
-
-#define EP93XX_MATRIX_ROWS		(8)
-#define EP93XX_MATRIX_COLS		(8)
-
-#endif	/* __KEYPAD_EP93XX_H */
diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h
index 142c33a2d7db..3e6cf2b25a97 100644
--- a/include/linux/soc/cirrus/ep93xx.h
+++ b/include/linux/soc/cirrus/ep93xx.h
@@ -2,7 +2,6 @@
 #ifndef _SOC_EP93XX_H
 #define _SOC_EP93XX_H
 
-struct platform_device;
 struct regmap;
 struct spinlock_t;
 
@@ -36,16 +35,4 @@ struct ep93xx_regmap_adev {
 #define to_ep93xx_regmap_adev(_adev) \
 	container_of((_adev), struct ep93xx_regmap_adev, adev)
 
-#ifdef CONFIG_ARCH_EP93XX
-int ep93xx_i2s_acquire(void);
-void ep93xx_i2s_release(void);
-unsigned int ep93xx_chip_revision(void);
-
-#else
-static inline int ep93xx_i2s_acquire(void) { return 0; }
-static inline void ep93xx_i2s_release(void) {}
-static inline unsigned int ep93xx_chip_revision(void) { return 0; }
-
-#endif
-
 #endif
-- 
cgit v1.2.3


From a015b1828653b591de0aa5303c0dbc4235935f94 Mon Sep 17 00:00:00 2001
From: Nikita Shubin <nikita.shubin@maquefel.me>
Date: Mon, 9 Sep 2024 11:11:03 +0300
Subject: dmaengine: cirrus: remove platform code

Remove DMA platform header, from now on we use device tree for DMA
clients.

Acked-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Nikita Shubin <nikita.shubin@maquefel.me>
Tested-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/linux/platform_data/dma-ep93xx.h | 100 -------------------------------
 1 file changed, 100 deletions(-)
 delete mode 100644 include/linux/platform_data/dma-ep93xx.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/dma-ep93xx.h b/include/linux/platform_data/dma-ep93xx.h
deleted file mode 100644
index 9ec5cdd5a1eb..000000000000
--- a/include/linux/platform_data/dma-ep93xx.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_ARCH_DMA_H
-#define __ASM_ARCH_DMA_H
-
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmaengine.h>
-#include <linux/dma-mapping.h>
-#include <linux/property.h>
-#include <linux/string.h>
-
-/*
- * M2P channels.
- *
- * Note that these values are also directly used for setting the PPALLOC
- * register.
- */
-#define EP93XX_DMA_I2S1		0
-#define EP93XX_DMA_I2S2		1
-#define EP93XX_DMA_AAC1		2
-#define EP93XX_DMA_AAC2		3
-#define EP93XX_DMA_AAC3		4
-#define EP93XX_DMA_I2S3		5
-#define EP93XX_DMA_UART1	6
-#define EP93XX_DMA_UART2	7
-#define EP93XX_DMA_UART3	8
-#define EP93XX_DMA_IRDA		9
-/* M2M channels */
-#define EP93XX_DMA_SSP		10
-#define EP93XX_DMA_IDE		11
-
-/**
- * struct ep93xx_dma_data - configuration data for the EP93xx dmaengine
- * @port: peripheral which is requesting the channel
- * @direction: TX/RX channel
- * @name: optional name for the channel, this is displayed in /proc/interrupts
- *
- * This information is passed as private channel parameter in a filter
- * function. Note that this is only needed for slave/cyclic channels.  For
- * memcpy channels %NULL data should be passed.
- */
-struct ep93xx_dma_data {
-	int				port;
-	enum dma_transfer_direction	direction;
-	const char			*name;
-};
-
-/**
- * struct ep93xx_dma_chan_data - platform specific data for a DMA channel
- * @name: name of the channel, used for getting the right clock for the channel
- * @base: mapped registers
- * @irq: interrupt number used by this channel
- */
-struct ep93xx_dma_chan_data {
-	const char			*name;
-	void __iomem			*base;
-	int				irq;
-};
-
-/**
- * struct ep93xx_dma_platform_data - platform data for the dmaengine driver
- * @channels: array of channels which are passed to the driver
- * @num_channels: number of channels in the array
- *
- * This structure is passed to the DMA engine driver via platform data. For
- * M2P channels, contract is that even channels are for TX and odd for RX.
- * There is no requirement for the M2M channels.
- */
-struct ep93xx_dma_platform_data {
-	struct ep93xx_dma_chan_data	*channels;
-	size_t				num_channels;
-};
-
-static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan)
-{
-	if (device_is_compatible(chan->device->dev, "cirrus,ep9301-dma-m2p"))
-		return true;
-
-	return !strcmp(dev_name(chan->device->dev), "ep93xx-dma-m2p");
-}
-
-/**
- * ep93xx_dma_chan_direction - returns direction the channel can be used
- * @chan: channel
- *
- * This function can be used in filter functions to find out whether the
- * channel supports given DMA direction. Only M2P channels have such
- * limitation, for M2M channels the direction is configurable.
- */
-static inline enum dma_transfer_direction
-ep93xx_dma_chan_direction(struct dma_chan *chan)
-{
-	if (!ep93xx_dma_chan_is_m2p(chan))
-		return DMA_TRANS_NONE;
-
-	/* even channels are for TX, odd for RX */
-	return (chan->chan_id % 2 == 0) ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
-}
-
-#endif /* __ASM_ARCH_DMA_H */
-- 
cgit v1.2.3


From 433d7ce2d86d21274838c9e8c796f4232cd13cdb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 6 Aug 2024 15:38:12 -0700
Subject: security,bpf: constify struct path in bpf_token_create() LSM hook

There is no reason why struct path pointer shouldn't be const-qualified
when being passed into bpf_token_create() LSM hook. Add that const.

Acked-by: Paul Moore <paul@paul-moore.com> (LSM/SELinux)
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/lsm_hook_defs.h | 2 +-
 include/linux/security.h      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 855db460e08b..462b55378241 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -431,7 +431,7 @@ LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
-	 struct path *path)
+	 const struct path *path)
 LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
 LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
 LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1390f1efb4f0..31523a2c71c4 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2137,7 +2137,7 @@ extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
 extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path);
+				     const struct path *path);
 extern void security_bpf_token_free(struct bpf_token *token);
 extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
@@ -2177,7 +2177,7 @@ static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
 
 static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path)
+					    const struct path *path)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From da2f660b3ba1be33310452959ab72d1d7ce39350 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Wed, 11 Sep 2024 13:17:46 -0700
Subject: net/mlx5: fs, make get_root_namespace API function

As preparation for HW Steering support, where the function
get_root_namespace() is needed to get root FDB, make it an API function
and rename it to mlx5_get_root_namespace().

Reviewed-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Link: https://patch.msgid.link/20240911201757.1505453-5-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/fs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 3fb428ce7d1c..b744e554f014 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -342,4 +342,7 @@ void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  struct mlx5_pkt_reformat *reformat);
 
 u32 mlx5_flow_table_id(struct mlx5_flow_table *ft);
+
+struct mlx5_flow_root_namespace *
+mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type);
 #endif
-- 
cgit v1.2.3


From 9947204cdad97d22d171039019a4aad4d6899cdd Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Wed, 11 Sep 2024 13:17:51 -0700
Subject: net/mlx5: Add device cap for supporting hot reset in sync reset flow

New devices with new FW can support sync reset for firmware activate
using hot reset. Add capability for supporting it and add MFRL field to
query from FW which type of PCI reset method to use while handling sync
reset events.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/20240911201757.1505453-10-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2045575b70d4..620a5c305123 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1856,7 +1856,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_328[0x2];
 	u8	   relaxed_ordering_read[0x1];
 	u8         log_max_pd[0x5];
-	u8         reserved_at_330[0x6];
+	u8         reserved_at_330[0x5];
+	u8         pcie_reset_using_hotreset_method[0x1];
 	u8         pci_sync_for_fw_update_with_driver_unload[0x1];
 	u8         vnic_env_cnt_steering_fail[0x1];
 	u8         vport_counter_local_loopback[0x1];
@@ -11188,6 +11189,11 @@ struct mlx5_ifc_mcda_reg_bits {
 	u8         data[][0x20];
 };
 
+enum {
+	MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE = 0,
+	MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET = 1,
+};
+
 enum {
 	MLX5_MFRL_REG_RESET_STATE_IDLE = 0,
 	MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION = 1,
@@ -11215,7 +11221,8 @@ struct mlx5_ifc_mfrl_reg_bits {
 	u8         pci_sync_for_fw_update_start[0x1];
 	u8         pci_sync_for_fw_update_resp[0x2];
 	u8         rst_type_sel[0x3];
-	u8         reserved_at_28[0x4];
+	u8         pci_reset_req_method[0x3];
+	u8         reserved_at_2b[0x1];
 	u8         reset_state[0x4];
 	u8         reset_type[0x8];
 	u8         reset_level[0x8];
-- 
cgit v1.2.3


From 5bd877093fd0b2e9e5f0c03b466669f761b5849c Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Wed, 11 Sep 2024 13:17:55 -0700
Subject: net/mlx5: Add NOT_READY command return status

Add a new command status MLX5_CMD_STAT_NOT_READY to handle cases
where the firmware is not ready.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Link: https://patch.msgid.link/20240911201757.1505453-14-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index a94bc9e3af96..d0f7d1f36c5e 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1449,6 +1449,7 @@ enum {
 	MLX5_CMD_STAT_BAD_SYS_STATE_ERR		= 0x4,
 	MLX5_CMD_STAT_BAD_RES_ERR		= 0x5,
 	MLX5_CMD_STAT_RES_BUSY			= 0x6,
+	MLX5_CMD_STAT_NOT_READY			= 0x7,
 	MLX5_CMD_STAT_LIM_ERR			= 0x8,
 	MLX5_CMD_STAT_BAD_RES_STATE_ERR		= 0x9,
 	MLX5_CMD_STAT_IX_ERR			= 0xa,
-- 
cgit v1.2.3


From 907936b6f4e630718cc31ddea79cc76a3e32080a Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@nvidia.com>
Date: Mon, 9 Sep 2024 13:05:04 +0300
Subject: net/mlx5: Handle memory scheme ODP capabilities

When running over new FW that supports the new memory scheme ODP, set
the cap in the FW to signal the FW we are working in the new scheme.

In the memory scheme ODP the per_transport_service capabilities are RO
for the driver so we skip their setting.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909100504.29797-9-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 154095256d0d..57c9b18c3adb 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1389,9 +1389,13 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_ODP(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap)
 
-#define MLX5_CAP_ODP_SCHEME(mdev, cap)                       \
-	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \
-		 transport_page_fault_scheme_cap.cap)
+#define MLX5_CAP_ODP_SCHEME(mdev, cap)                                \
+	(MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur,         \
+		  mem_page_fault) ?                                   \
+		 MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \
+			  memory_page_fault_scheme_cap.cap) :         \
+		 MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \
+			  transport_page_fault_scheme_cap.cap))
 
 #define MLX5_CAP_ODP_MAX(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap)
-- 
cgit v1.2.3


From 8d159eb2117b2e3697a31785662b653938f007cb Mon Sep 17 00:00:00 2001
From: Chiara Meiohas <cmeiohas@nvidia.com>
Date: Mon, 9 Sep 2024 20:30:23 +0300
Subject: RDMA/mlx5: Use IB set_netdev and get_netdev functions

The IB layer provides a common interface to store and get net
devices associated to an IB device port (ib_device_set_netdev()
and ib_device_get_netdev()).
Previously, mlx5_ib stored and managed the associated net devices
internally.

Replace internal net device management in mlx5_ib with
ib_device_set_netdev() when attaching/detaching  a net device and
ib_device_get_netdev() when retrieving the net device.

Export ib_device_get_netdev().

For mlx5 representors/PFs/VFs and lag creation we replace the netdev
assignments with the IB set/get netdev functions.

In active-backup mode lag the active slave net device is stored in the
lag itself. To assure the net device stored in a lag bond IB device is
the active slave we implement the following:
- mlx5_core: when modifying the slave of a bond we send the internal driver event
  MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE.
- mlx5_ib: when catching the event call ib_device_set_netdev()

This patch also ensures the correct IB events are sent in switchdev lag.

While at it, when in multiport eswitch mode, only a single IB device is
created for all ports. The said IB device will receive all netdev events
of its VFs once loaded, thus to avoid overwriting the mapping of PF IB
device to PF netdev, ignore NETDEV_REGISTER events if the ib device has
already been mapped to a netdev.

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/device.h | 1 +
 include/linux/mlx5/driver.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 57c9b18c3adb..dac33cfe9c0c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -371,6 +371,7 @@ enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
 	MLX5_DRIVER_EVENT_AFFILIATION_DONE,
 	MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
+	MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a96438ded15f..46a7a3d11048 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -643,6 +643,7 @@ struct mlx5_priv {
 	struct mlx5_sf_hw_table *sf_hw_table;
 	struct mlx5_sf_table *sf_table;
 #endif
+	struct blocking_notifier_head lag_nh;
 };
 
 enum mlx5_device_state {
@@ -1181,7 +1182,6 @@ bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_master(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev);
-struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
 			   struct net_device *slave);
 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 96f8052822e03c6f49b6b28fc1d6e5e0522ecbb9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 12 Sep 2024 15:39:54 -0700
Subject: locking/mutex: Define mutex_init() once

With CONFIG_PREEMPT_RT disabled __mutex_init() is a function. With
CONFIG_PREEMPT_RT enabled, __mutex_init() is a macro. I assume this is why
mutex_init() is defined twice as exactly the same macro.

Prepare for introducing a new macro for mutex initialization by combining
the two identical mutex_init() definitions into a single definition. This
patch does not change any functionality because the C preprocessor expands
macros when it encounters the macro name and not when a macro definition
is encountered. See also commit bb630f9f7a7d ("locking/rtmutex: Add mutex
variant for RT").

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240912223956.3554086-2-bvanassche@acm.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mutex.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a561c629d89f..ef617089db19 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -49,7 +49,6 @@ static inline void mutex_destroy(struct mutex *lock) {}
 
 #endif
 
-#ifndef CONFIG_PREEMPT_RT
 /**
  * mutex_init - initialize the mutex
  * @mutex: the mutex to be initialized
@@ -65,6 +64,7 @@ do {									\
 	__mutex_init((mutex), #mutex, &__key);				\
 } while (0)
 
+#ifndef CONFIG_PREEMPT_RT
 #define __MUTEX_INITIALIZER(lockname) \
 		{ .owner = ATOMIC_LONG_INIT(0) \
 		, .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
@@ -111,12 +111,6 @@ do {							\
 	__mutex_rt_init((mutex), name, key);		\
 } while (0)
 
-#define mutex_init(mutex)				\
-do {							\
-	static struct lock_class_key __key;		\
-							\
-	__mutex_init((mutex), #mutex, &__key);		\
-} while (0)
 #endif /* CONFIG_PREEMPT_RT */
 
 #ifdef CONFIG_DEBUG_MUTEXES
-- 
cgit v1.2.3


From e837d833a13461c10f265a65ce6612e6dd43e76f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 12 Sep 2024 15:39:55 -0700
Subject: locking/mutex: Introduce mutex_init_with_key()

The following pattern occurs 5 times in kernel drivers:

	lockdep_register_key(key);
	__mutex_init(mutex, name, key);

In several cases the 'name' argument matches #mutex. Hence, introduce
the mutex_init_with_key() macro. This macro derives the 'name' argument
from the 'mutex' argument.

Suggested-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240912223956.3554086-3-bvanassche@acm.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mutex.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ef617089db19..2bf91b57591b 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -64,6 +64,17 @@ do {									\
 	__mutex_init((mutex), #mutex, &__key);				\
 } while (0)
 
+/**
+ * mutex_init_with_key - initialize a mutex with a given lockdep key
+ * @mutex: the mutex to be initialized
+ * @key: the lockdep key to be associated with the mutex
+ *
+ * Initialize the mutex to the unlocked state.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+#define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key))
+
 #ifndef CONFIG_PREEMPT_RT
 #define __MUTEX_INITIALIZER(lockname) \
 		{ .owner = ATOMIC_LONG_INIT(0) \
-- 
cgit v1.2.3


From 4b7ff9ab98af11a477d50f08382bcc4c2f899926 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 13 Sep 2024 10:15:56 +0200
Subject: mm, slab: restore kerneldoc for kmem_cache_create()

As kmem_cache_create() became a _Generic() wrapper macro, it currently
has no kerneldoc despite being the main API to use. Add it. Also adjust
kmem_cache_create_usercopy() kerneldoc to indicate it is now a legacy
wrapper.

Also expand the kerneldoc for struct kmem_cache_args, especially for the
freeptr_offset field, where important details were removed with the
removal of kmem_cache_create_rcu().

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/slab.h | 114 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 331412a9f4f2..6a8ab7ef3af7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -242,19 +242,72 @@ bool slab_is_available(void);
 
 /**
  * struct kmem_cache_args - Less common arguments for kmem_cache_create()
- * @align: The required alignment for the objects.
- * @useroffset: Usercopy region offset
- * @usersize: Usercopy region size
- * @freeptr_offset: Custom offset for the free pointer in RCU caches
- * @use_freeptr_offset: Whether a @freeptr_offset is used
- * @ctor: A constructor for the objects.
+ *
+ * Any uninitialized fields of the structure are interpreted as unused. The
+ * exception is @freeptr_offset where %0 is a valid value, so
+ * @use_freeptr_offset must be also set to %true in order to interpret the field
+ * as used. For @useroffset %0 is also valid, but only with non-%0
+ * @usersize.
+ *
+ * When %NULL args is passed to kmem_cache_create(), it is equivalent to all
+ * fields unused.
  */
 struct kmem_cache_args {
+	/**
+	 * @align: The required alignment for the objects.
+	 *
+	 * %0 means no specific alignment is requested.
+	 */
 	unsigned int align;
+	/**
+	 * @useroffset: Usercopy region offset.
+	 *
+	 * %0 is a valid offset, when @usersize is non-%0
+	 */
 	unsigned int useroffset;
+	/**
+	 * @usersize: Usercopy region size.
+	 *
+	 * %0 means no usercopy region is specified.
+	 */
 	unsigned int usersize;
+	/**
+	 * @freeptr_offset: Custom offset for the free pointer
+	 * in &SLAB_TYPESAFE_BY_RCU caches
+	 *
+	 * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
+	 * outside of the object. This might cause the object to grow in size.
+	 * Cache creators that have a reason to avoid this can specify a custom
+	 * free pointer offset in their struct where the free pointer will be
+	 * placed.
+	 *
+	 * Note that placing the free pointer inside the object requires the
+	 * caller to ensure that no fields are invalidated that are required to
+	 * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
+	 * details).
+	 *
+	 * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
+	 * is specified, %use_freeptr_offset must be set %true.
+	 *
+	 * Note that @ctor currently isn't supported with custom free pointers
+	 * as a @ctor requires an external free pointer.
+	 */
 	unsigned int freeptr_offset;
+	/**
+	 * @use_freeptr_offset: Whether a @freeptr_offset is used.
+	 */
 	bool use_freeptr_offset;
+	/**
+	 * @ctor: A constructor for the objects.
+	 *
+	 * The constructor is invoked for each object in a newly allocated slab
+	 * page. It is the cache user's responsibility to free object in the
+	 * same state as after calling the constructor, or deal appropriately
+	 * with any differences between a freshly constructed and a reallocated
+	 * object.
+	 *
+	 * %NULL means no constructor.
+	 */
 	void (*ctor)(void *);
 };
 
@@ -275,30 +328,20 @@ __kmem_cache_create(const char *name, unsigned int size, unsigned int align,
 }
 
 /**
- * kmem_cache_create_usercopy - Create a cache with a region suitable
- * for copying to userspace
+ * kmem_cache_create_usercopy - Create a kmem cache with a region suitable
+ * for copying to userspace.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
  * @size: The size of objects to be created in this cache.
  * @align: The required alignment for the objects.
  * @flags: SLAB flags
  * @useroffset: Usercopy region offset
  * @usersize: Usercopy region size
- * @ctor: A constructor for the objects.
- *
- * Cannot be called within a interrupt, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
- *
- * The flags are
+ * @ctor: A constructor for the objects, or %NULL.
  *
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
+ * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY()
+ * if whitelisting a single field is sufficient, or kmem_cache_create() with
+ * the necessary parameters passed via the args parameter (see
+ * &struct kmem_cache_args)
  *
  * Return: a pointer to the cache on success, NULL on failure.
  */
@@ -333,6 +376,31 @@ __kmem_cache_default_args(const char *name, unsigned int size,
 	return __kmem_cache_create_args(name, size, &kmem_default_args, flags);
 }
 
+/**
+ * kmem_cache_create - Create a kmem cache.
+ * @__name: A string which is used in /proc/slabinfo to identify this cache.
+ * @__object_size: The size of objects to be created in this cache.
+ * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL
+ *	    means defaults will be used for all the arguments.
+ *
+ * This is currently implemented as a macro using ``_Generic()`` to call
+ * either the new variant of the function, or a legacy one.
+ *
+ * The new variant has 4 parameters:
+ * ``kmem_cache_create(name, object_size, args, flags)``
+ *
+ * See __kmem_cache_create_args() which implements this.
+ *
+ * The legacy variant has 5 parameters:
+ * ``kmem_cache_create(name, object_size, align, flags, ctor)``
+ *
+ * The align and ctor parameters map to the respective fields of
+ * &struct kmem_cache_args
+ *
+ * Context: Cannot be called within a interrupt, but can be interrupted.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
+ */
 #define kmem_cache_create(__name, __object_size, __args, ...)           \
 	_Generic((__args),                                              \
 		struct kmem_cache_args *: __kmem_cache_create_args,	\
-- 
cgit v1.2.3


From 6a36d828bdef0e02b1e6c12e2160f5b83be6aab5 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 13 Sep 2024 02:09:55 +0100
Subject: driver core: attribute_container: Remove unused functions

I can't find any use of 'attribute_container_add_class_device_adapter'
or 'attribute_container_trigger' in git history.
Their export decls went in 2006:
  commit 1740757e8f94 ("[PATCH] Driver Core: remove unused exports")

and their docs disappeared in 2016:
  commit 47cb398dd75a ("Docs: sphinxify device-drivers.tmpl")

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20240913010955.1393995-1-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/attribute_container.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index e4004d1e6725..b3643de9931d 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -61,14 +61,8 @@ int attribute_container_device_trigger_safe(struct device *dev,
 					    int (*undo)(struct attribute_container *,
 							struct device *,
 							struct device *));
-void attribute_container_trigger(struct device *dev, 
-				 int (*fn)(struct attribute_container *,
-					   struct device *));
 int attribute_container_add_attrs(struct device *classdev);
 int attribute_container_add_class_device(struct device *classdev);
-int attribute_container_add_class_device_adapter(struct attribute_container *cont,
-						 struct device *dev,
-						 struct device *classdev);
 void attribute_container_remove_attrs(struct device *classdev);
 void attribute_container_class_device_del(struct device *classdev);
 struct attribute_container *attribute_container_classdev_to_container(struct device *);
-- 
cgit v1.2.3


From 2cb4acf2140be8a4f299c0b887cc314845ef6ec8 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 13 Sep 2024 07:11:42 -0700
Subject: hwmon: Remove devm_hwmon_device_unregister() API function

devm_hwmon_device_unregister() has no in-tree user, and its implementation
is wrong since it does not pass the to-be-removed hardware monitoring
device as parameter. I do not envision a valid use for it; drivers needing
it should not have called devm_hwmon_device_register_with_info() in the
first place. Remove it.

Reported-by: Matthew Sanders <m@ttsande.rs>
Closes: https://lore.kernel.org/linux-hwmon/488b3bdf870ea76c4b943dbe5fd15ac8113019dc.camel@kernel.org/
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 include/linux/hwmon.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index e94314760aab..5c6a421ad580 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -481,7 +481,6 @@ devm_hwmon_device_register_with_info(struct device *dev,
 				const struct attribute_group **extra_groups);
 
 void hwmon_device_unregister(struct device *dev);
-void devm_hwmon_device_unregister(struct device *dev);
 
 int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type,
 		       u32 attr, int channel);
-- 
cgit v1.2.3


From d175ee98fe545d2c56df22751314584cce228307 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 2 Sep 2024 21:17:18 +0200
Subject: mm: Define VM_DROPPABLE for powerpc/32

Commit 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always
lazily freeable mappings") only adds VM_DROPPABLE for 64 bits
architectures.

In order to also use the getrandom vDSO implementation on powerpc/32,
use VM_ARCH_1 for VM_DROPPABLE on powerpc/32. This is possible because
VM_ARCH_1 is used for VM_SAO on powerpc and VM_SAO is only for
powerpc/64. It is used in combination with PROT_SAO in some parts of
code that are restricted to CONFIG_PPC64 through #ifdefs, it is
therefore possible to define VM_SAO for CONFIG_PPC64 only.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 include/linux/mm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6549d0979b28..028847f39442 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -359,7 +359,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86)
 # define VM_PAT		VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC)
+#elif defined(CONFIG_PPC64)
 # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
 # define VM_GROWSUP	VM_ARCH_1
@@ -409,6 +409,8 @@ extern unsigned int kobjsize(const void *objp);
 #ifdef CONFIG_64BIT
 #define VM_DROPPABLE_BIT	40
 #define VM_DROPPABLE		BIT(VM_DROPPABLE_BIT)
+#elif defined(CONFIG_PPC32)
+#define VM_DROPPABLE		VM_ARCH_1
 #else
 #define VM_DROPPABLE		VM_NONE
 #endif
-- 
cgit v1.2.3


From 2b018086143d638de8d67ae5be6e8c1afb413193 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 13 Sep 2024 11:28:46 -0700
Subject: blk-mq: unconditional nr_integrity_segments

Always defining the field will make using it easier and less error prone
in future patches.

There shouldn't be any downside to this: the field fits in what would
otherwise be a 2-byte hole, so we're not saving space by conditionally
leaving it out.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20240913182854.2445457-2-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-mq.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8d304b1d16b1..4fecf46ef681 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -149,10 +149,7 @@ struct request {
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
 	unsigned short nr_integrity_segments;
-#endif
 
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct bio_crypt_ctx *crypt_ctx;
-- 
cgit v1.2.3


From d2c5b1faccd5ef6352456f817e941945d3b3fe62 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 13 Sep 2024 11:28:50 -0700
Subject: block: provide a request helper for user integrity segments

Provide a helper to keep the request flags and nr_integrity_segments in
sync with the bio's integrity payload. This is an integrity equivalent
to the normal data helper function, 'blk_rq_map_user()'.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20240913182854.2445457-6-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-integrity.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index de98049b7ded..793dbb1e0672 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -28,6 +28,8 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
 int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 				   struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
+			      ssize_t bytes, u32 seed);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -102,6 +104,13 @@ static inline int blk_rq_map_integrity_sg(struct request_queue *q,
 {
 	return 0;
 }
+static inline int blk_rq_integrity_map_user(struct request *rq,
+					    void __user *ubuf,
+					    ssize_t bytes,
+					    u32 seed)
+{
+	return -EINVAL;
+}
 static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
 {
 	return NULL;
-- 
cgit v1.2.3


From 76c313f658d2752e8527610677164aa7094ef7a5 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 13 Sep 2024 12:17:46 -0700
Subject: blk-integrity: improved sg segment mapping

Make the integrity mapping more like data mapping, blk_rq_map_sg. Use
the request to validate the segment count, and update the callers so
they don't have to.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20240913191746.2628196-1-kbusch@meta.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-integrity.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index 793dbb1e0672..676f8f860c47 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -25,8 +25,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
-int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
-				   struct scatterlist *);
+int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes, u32 seed);
@@ -98,8 +97,7 @@ static inline int blk_rq_count_integrity_sg(struct request_queue *q,
 {
 	return 0;
 }
-static inline int blk_rq_map_integrity_sg(struct request_queue *q,
-					  struct bio *b,
+static inline int blk_rq_map_integrity_sg(struct request *q,
 					  struct scatterlist *s)
 {
 	return 0;
-- 
cgit v1.2.3


From 32556ce93bc45c730829083cb60f95a2728ea48b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 Sep 2024 21:17:48 +0200
Subject: bpf: Fix helper writes to read-only maps

Lonial found an issue that despite user- and BPF-side frozen BPF map
(like in case of .rodata), it was still possible to write into it from
a BPF program side through specific helpers having ARG_PTR_TO_{LONG,INT}
as arguments.

In check_func_arg() when the argument is as mentioned, the meta->raw_mode
is never set. Later, check_helper_mem_access(), under the case of
PTR_TO_MAP_VALUE as register base type, it assumes BPF_READ for the
subsequent call to check_map_access_type() and given the BPF map is
read-only it succeeds.

The helpers really need to be annotated as ARG_PTR_TO_{LONG,INT} | MEM_UNINIT
when results are written into them as opposed to read out of them. The
latter indicates that it's okay to pass a pointer to uninitialized memory
as the memory is written to anyway.

However, ARG_PTR_TO_{LONG,INT} is a special case of ARG_PTR_TO_FIXED_SIZE_MEM
just with additional alignment requirement. So it is better to just get
rid of the ARG_PTR_TO_{LONG,INT} special cases altogether and reuse the
fixed size memory types. For this, add MEM_ALIGNED to additionally ensure
alignment given these helpers write directly into the args via *<ptr> = val.
The .arg*_size has been initialized reflecting the actual sizeof(*<ptr>).

MEM_ALIGNED can only be used in combination with MEM_FIXED_SIZE annotated
argument types, since in !MEM_FIXED_SIZE cases the verifier does not know
the buffer size a priori and therefore cannot blindly write *<ptr> = val.

Fixes: 57c3bb725a3d ("bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types")
Reported-by: Lonial Con <kongln9170@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20240913191754.13290-3-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7f3e8477d5e7..0c3893c47171 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -695,6 +695,11 @@ enum bpf_type_flag {
 	/* DYNPTR points to xdp_buff */
 	DYNPTR_TYPE_XDP		= BIT(16 + BPF_BASE_TYPE_BITS),
 
+	/* Memory must be aligned on some architectures, used in combination with
+	 * MEM_FIXED_SIZE.
+	 */
+	MEM_ALIGNED		= BIT(17 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
@@ -732,8 +737,6 @@ enum bpf_arg_type {
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
-	ARG_PTR_TO_INT,		/* pointer to int */
-	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
 	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
 	ARG_PTR_TO_RINGBUF_MEM,	/* pointer to dynamically reserved ringbuf memory */
-- 
cgit v1.2.3


From 0cca961a026177af69044f10d6ae76d8ce043764 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Thu, 12 Sep 2024 11:00:25 +0530
Subject: PCI: Pass domain number to pci_bus_release_domain_nr() explicitly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pci_bus_release_domain_nr() API is supposed to free the domain
number allocated by pci_bus_find_domain_nr(). Most of the callers of
pci_bus_find_domain_nr(), store the domain number in pci_bus::domain_nr.

As such, the pci_bus_release_domain_nr() implicitly frees the domain
number by dereferencing 'struct pci_bus'. However, one of the callers
of this API, the PCI endpoint subsystem, doesn't have 'struct pci_bus',
so it only passes NULL. Due to this, the API will end up dereferencing
the NULL pointer.

To fix this issue, pass the domain number to this API explicitly. Since
'struct pci_bus' is not used for anything else other than extracting the
domain number, it makes sense to pass the domain number directly.

Fixes: 0328947c5032 ("PCI: endpoint: Assign PCI domain number for endpoint controllers")
Closes: https://lore.kernel.org/linux-pci/c0c40ddb-bf64-4b22-9dd1-8dbb18aa2813@stanley.mountain
Link: https://lore.kernel.org/linux-pci/20240912053025.25314-1-manivannan.sadhasivam@linaro.org
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
[kwilczynski: commit log]
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 4cf89a4b4cbc..37d97bef060f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1884,7 +1884,7 @@ static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
 { return 0; }
 #endif
 int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent);
-void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent);
+void pci_bus_release_domain_nr(struct device *parent, int domain_nr);
 #endif
 
 /* Some architectures require additional setup to direct VGA traffic */
-- 
cgit v1.2.3


From 1b8c9cb3151a541a4197d2bb449233064f747832 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 12 Sep 2024 09:19:49 +0800
Subject: MIPS: Remove the obsoleted code for include/linux/mv643xx.h

Most of the drivers which used this header have been deleted, most
of these code is obsoleted, move the only defines that are actually
used into arch/powerpc/platforms/chrp/pegasos_eth.c and delete the
file completely.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Link: https://patch.msgid.link/20240912011949.2726928-1-cuigaosheng1@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mv643xx.h | 921 ------------------------------------------------
 1 file changed, 921 deletions(-)
 delete mode 100644 include/linux/mv643xx.h

(limited to 'include/linux')

diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
deleted file mode 100644
index 000b126acfb6..000000000000
--- a/include/linux/mv643xx.h
+++ /dev/null
@@ -1,921 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mv643xx.h - MV-643XX Internal registers definition file.
- *
- * Copyright 2002 Momentum Computer, Inc.
- * 	Author: Matthew Dharm <mdharm@momenco.com>
- * Copyright 2002 GALILEO TECHNOLOGY, LTD. 
- */
-#ifndef __ASM_MV643XX_H
-#define __ASM_MV643XX_H
-
-#include <asm/types.h>
-#include <linux/mv643xx_eth.h>
-#include <linux/mv643xx_i2c.h>
-
-/****************************************/
-/* Processor Address Space              */
-/****************************************/
-
-/* DDR SDRAM BAR and size registers */
-
-#define MV64340_CS_0_BASE_ADDR                                      0x008
-#define MV64340_CS_0_SIZE                                           0x010
-#define MV64340_CS_1_BASE_ADDR                                      0x208
-#define MV64340_CS_1_SIZE                                           0x210
-#define MV64340_CS_2_BASE_ADDR                                      0x018
-#define MV64340_CS_2_SIZE                                           0x020
-#define MV64340_CS_3_BASE_ADDR                                      0x218
-#define MV64340_CS_3_SIZE                                           0x220
-
-/* Devices BAR and size registers */
-
-#define MV64340_DEV_CS0_BASE_ADDR                                   0x028
-#define MV64340_DEV_CS0_SIZE                                        0x030
-#define MV64340_DEV_CS1_BASE_ADDR                                   0x228
-#define MV64340_DEV_CS1_SIZE                                        0x230
-#define MV64340_DEV_CS2_BASE_ADDR                                   0x248
-#define MV64340_DEV_CS2_SIZE                                        0x250
-#define MV64340_DEV_CS3_BASE_ADDR                                   0x038
-#define MV64340_DEV_CS3_SIZE                                        0x040
-#define MV64340_BOOTCS_BASE_ADDR                                    0x238
-#define MV64340_BOOTCS_SIZE                                         0x240
-
-/* PCI 0 BAR and size registers */
-
-#define MV64340_PCI_0_IO_BASE_ADDR                                  0x048
-#define MV64340_PCI_0_IO_SIZE                                       0x050
-#define MV64340_PCI_0_MEMORY0_BASE_ADDR                             0x058
-#define MV64340_PCI_0_MEMORY0_SIZE                                  0x060
-#define MV64340_PCI_0_MEMORY1_BASE_ADDR                             0x080
-#define MV64340_PCI_0_MEMORY1_SIZE                                  0x088
-#define MV64340_PCI_0_MEMORY2_BASE_ADDR                             0x258
-#define MV64340_PCI_0_MEMORY2_SIZE                                  0x260
-#define MV64340_PCI_0_MEMORY3_BASE_ADDR                             0x280
-#define MV64340_PCI_0_MEMORY3_SIZE                                  0x288
-
-/* PCI 1 BAR and size registers */
-#define MV64340_PCI_1_IO_BASE_ADDR                                  0x090
-#define MV64340_PCI_1_IO_SIZE                                       0x098
-#define MV64340_PCI_1_MEMORY0_BASE_ADDR                             0x0a0
-#define MV64340_PCI_1_MEMORY0_SIZE                                  0x0a8
-#define MV64340_PCI_1_MEMORY1_BASE_ADDR                             0x0b0
-#define MV64340_PCI_1_MEMORY1_SIZE                                  0x0b8
-#define MV64340_PCI_1_MEMORY2_BASE_ADDR                             0x2a0
-#define MV64340_PCI_1_MEMORY2_SIZE                                  0x2a8
-#define MV64340_PCI_1_MEMORY3_BASE_ADDR                             0x2b0
-#define MV64340_PCI_1_MEMORY3_SIZE                                  0x2b8
-
-/* SRAM base address */
-#define MV64340_INTEGRATED_SRAM_BASE_ADDR                           0x268
-
-/* internal registers space base address */
-#define MV64340_INTERNAL_SPACE_BASE_ADDR                            0x068
-
-/* Enables the CS , DEV_CS , PCI 0 and PCI 1 
-   windows above */
-#define MV64340_BASE_ADDR_ENABLE                                    0x278
-
-/****************************************/
-/* PCI remap registers                  */
-/****************************************/
-      /* PCI 0 */
-#define MV64340_PCI_0_IO_ADDR_REMAP                                 0x0f0
-#define MV64340_PCI_0_MEMORY0_LOW_ADDR_REMAP                        0x0f8
-#define MV64340_PCI_0_MEMORY0_HIGH_ADDR_REMAP                       0x320
-#define MV64340_PCI_0_MEMORY1_LOW_ADDR_REMAP                        0x100
-#define MV64340_PCI_0_MEMORY1_HIGH_ADDR_REMAP                       0x328
-#define MV64340_PCI_0_MEMORY2_LOW_ADDR_REMAP                        0x2f8
-#define MV64340_PCI_0_MEMORY2_HIGH_ADDR_REMAP                       0x330
-#define MV64340_PCI_0_MEMORY3_LOW_ADDR_REMAP                        0x300
-#define MV64340_PCI_0_MEMORY3_HIGH_ADDR_REMAP                       0x338
-      /* PCI 1 */
-#define MV64340_PCI_1_IO_ADDR_REMAP                                 0x108
-#define MV64340_PCI_1_MEMORY0_LOW_ADDR_REMAP                        0x110
-#define MV64340_PCI_1_MEMORY0_HIGH_ADDR_REMAP                       0x340
-#define MV64340_PCI_1_MEMORY1_LOW_ADDR_REMAP                        0x118
-#define MV64340_PCI_1_MEMORY1_HIGH_ADDR_REMAP                       0x348
-#define MV64340_PCI_1_MEMORY2_LOW_ADDR_REMAP                        0x310
-#define MV64340_PCI_1_MEMORY2_HIGH_ADDR_REMAP                       0x350
-#define MV64340_PCI_1_MEMORY3_LOW_ADDR_REMAP                        0x318
-#define MV64340_PCI_1_MEMORY3_HIGH_ADDR_REMAP                       0x358
- 
-#define MV64340_CPU_PCI_0_HEADERS_RETARGET_CONTROL                  0x3b0
-#define MV64340_CPU_PCI_0_HEADERS_RETARGET_BASE                     0x3b8
-#define MV64340_CPU_PCI_1_HEADERS_RETARGET_CONTROL                  0x3c0
-#define MV64340_CPU_PCI_1_HEADERS_RETARGET_BASE                     0x3c8
-#define MV64340_CPU_GE_HEADERS_RETARGET_CONTROL                     0x3d0
-#define MV64340_CPU_GE_HEADERS_RETARGET_BASE                        0x3d8
-#define MV64340_CPU_IDMA_HEADERS_RETARGET_CONTROL                   0x3e0
-#define MV64340_CPU_IDMA_HEADERS_RETARGET_BASE                      0x3e8
-
-/****************************************/
-/*         CPU Control Registers        */
-/****************************************/
-
-#define MV64340_CPU_CONFIG                                          0x000
-#define MV64340_CPU_MODE                                            0x120
-#define MV64340_CPU_MASTER_CONTROL                                  0x160
-#define MV64340_CPU_CROSS_BAR_CONTROL_LOW                           0x150
-#define MV64340_CPU_CROSS_BAR_CONTROL_HIGH                          0x158
-#define MV64340_CPU_CROSS_BAR_TIMEOUT                               0x168
-
-/****************************************/
-/* SMP RegisterS                        */
-/****************************************/
-
-#define MV64340_SMP_WHO_AM_I                                        0x200
-#define MV64340_SMP_CPU0_DOORBELL                                   0x214
-#define MV64340_SMP_CPU0_DOORBELL_CLEAR                             0x21C
-#define MV64340_SMP_CPU1_DOORBELL                                   0x224
-#define MV64340_SMP_CPU1_DOORBELL_CLEAR                             0x22C
-#define MV64340_SMP_CPU0_DOORBELL_MASK                              0x234
-#define MV64340_SMP_CPU1_DOORBELL_MASK                              0x23C
-#define MV64340_SMP_SEMAPHOR0                                       0x244
-#define MV64340_SMP_SEMAPHOR1                                       0x24c
-#define MV64340_SMP_SEMAPHOR2                                       0x254
-#define MV64340_SMP_SEMAPHOR3                                       0x25c
-#define MV64340_SMP_SEMAPHOR4                                       0x264
-#define MV64340_SMP_SEMAPHOR5                                       0x26c
-#define MV64340_SMP_SEMAPHOR6                                       0x274
-#define MV64340_SMP_SEMAPHOR7                                       0x27c
-
-/****************************************/
-/*  CPU Sync Barrier Register           */
-/****************************************/
-
-#define MV64340_CPU_0_SYNC_BARRIER_TRIGGER                          0x0c0
-#define MV64340_CPU_0_SYNC_BARRIER_VIRTUAL                          0x0c8
-#define MV64340_CPU_1_SYNC_BARRIER_TRIGGER                          0x0d0
-#define MV64340_CPU_1_SYNC_BARRIER_VIRTUAL                          0x0d8
-
-/****************************************/
-/* CPU Access Protect                   */
-/****************************************/
-
-#define MV64340_CPU_PROTECT_WINDOW_0_BASE_ADDR                      0x180
-#define MV64340_CPU_PROTECT_WINDOW_0_SIZE                           0x188
-#define MV64340_CPU_PROTECT_WINDOW_1_BASE_ADDR                      0x190
-#define MV64340_CPU_PROTECT_WINDOW_1_SIZE                           0x198
-#define MV64340_CPU_PROTECT_WINDOW_2_BASE_ADDR                      0x1a0
-#define MV64340_CPU_PROTECT_WINDOW_2_SIZE                           0x1a8
-#define MV64340_CPU_PROTECT_WINDOW_3_BASE_ADDR                      0x1b0
-#define MV64340_CPU_PROTECT_WINDOW_3_SIZE                           0x1b8
-
-
-/****************************************/
-/*          CPU Error Report            */
-/****************************************/
-
-#define MV64340_CPU_ERROR_ADDR_LOW                                  0x070
-#define MV64340_CPU_ERROR_ADDR_HIGH                                 0x078
-#define MV64340_CPU_ERROR_DATA_LOW                                  0x128
-#define MV64340_CPU_ERROR_DATA_HIGH                                 0x130
-#define MV64340_CPU_ERROR_PARITY                                    0x138
-#define MV64340_CPU_ERROR_CAUSE                                     0x140
-#define MV64340_CPU_ERROR_MASK                                      0x148
-
-/****************************************/
-/*      CPU Interface Debug Registers 	*/
-/****************************************/
-
-#define MV64340_PUNIT_SLAVE_DEBUG_LOW                               0x360
-#define MV64340_PUNIT_SLAVE_DEBUG_HIGH                              0x368
-#define MV64340_PUNIT_MASTER_DEBUG_LOW                              0x370
-#define MV64340_PUNIT_MASTER_DEBUG_HIGH                             0x378
-#define MV64340_PUNIT_MMASK                                         0x3e4
-
-/****************************************/
-/*  Integrated SRAM Registers           */
-/****************************************/
-
-#define MV64340_SRAM_CONFIG                                         0x380
-#define MV64340_SRAM_TEST_MODE                                      0X3F4
-#define MV64340_SRAM_ERROR_CAUSE                                    0x388
-#define MV64340_SRAM_ERROR_ADDR                                     0x390
-#define MV64340_SRAM_ERROR_ADDR_HIGH                                0X3F8
-#define MV64340_SRAM_ERROR_DATA_LOW                                 0x398
-#define MV64340_SRAM_ERROR_DATA_HIGH                                0x3a0
-#define MV64340_SRAM_ERROR_DATA_PARITY                              0x3a8
-
-/****************************************/
-/* SDRAM Configuration                  */
-/****************************************/
-
-#define MV64340_SDRAM_CONFIG                                        0x1400
-#define MV64340_D_UNIT_CONTROL_LOW                                  0x1404
-#define MV64340_D_UNIT_CONTROL_HIGH                                 0x1424
-#define MV64340_SDRAM_TIMING_CONTROL_LOW                            0x1408
-#define MV64340_SDRAM_TIMING_CONTROL_HIGH                           0x140c
-#define MV64340_SDRAM_ADDR_CONTROL                                  0x1410
-#define MV64340_SDRAM_OPEN_PAGES_CONTROL                            0x1414
-#define MV64340_SDRAM_OPERATION                                     0x1418
-#define MV64340_SDRAM_MODE                                          0x141c
-#define MV64340_EXTENDED_DRAM_MODE                                  0x1420
-#define MV64340_SDRAM_CROSS_BAR_CONTROL_LOW                         0x1430
-#define MV64340_SDRAM_CROSS_BAR_CONTROL_HIGH                        0x1434
-#define MV64340_SDRAM_CROSS_BAR_TIMEOUT                             0x1438
-#define MV64340_SDRAM_ADDR_CTRL_PADS_CALIBRATION                    0x14c0
-#define MV64340_SDRAM_DATA_PADS_CALIBRATION                         0x14c4
-
-/****************************************/
-/* SDRAM Error Report                   */
-/****************************************/
-
-#define MV64340_SDRAM_ERROR_DATA_LOW                                0x1444
-#define MV64340_SDRAM_ERROR_DATA_HIGH                               0x1440
-#define MV64340_SDRAM_ERROR_ADDR                                    0x1450
-#define MV64340_SDRAM_RECEIVED_ECC                                  0x1448
-#define MV64340_SDRAM_CALCULATED_ECC                                0x144c
-#define MV64340_SDRAM_ECC_CONTROL                                   0x1454
-#define MV64340_SDRAM_ECC_ERROR_COUNTER                             0x1458
-
-/******************************************/
-/*  Controlled Delay Line (CDL) Registers */
-/******************************************/
-
-#define MV64340_DFCDL_CONFIG0                                       0x1480
-#define MV64340_DFCDL_CONFIG1                                       0x1484
-#define MV64340_DLL_WRITE                                           0x1488
-#define MV64340_DLL_READ                                            0x148c
-#define MV64340_SRAM_ADDR                                           0x1490
-#define MV64340_SRAM_DATA0                                          0x1494
-#define MV64340_SRAM_DATA1                                          0x1498
-#define MV64340_SRAM_DATA2                                          0x149c
-#define MV64340_DFCL_PROBE                                          0x14a0
-
-/******************************************/
-/*   Debug Registers                      */
-/******************************************/
-
-#define MV64340_DUNIT_DEBUG_LOW                                     0x1460
-#define MV64340_DUNIT_DEBUG_HIGH                                    0x1464
-#define MV64340_DUNIT_MMASK                                         0X1b40
-
-/****************************************/
-/* Device Parameters			*/
-/****************************************/
-
-#define MV64340_DEVICE_BANK0_PARAMETERS				    0x45c
-#define MV64340_DEVICE_BANK1_PARAMETERS				    0x460
-#define MV64340_DEVICE_BANK2_PARAMETERS				    0x464
-#define MV64340_DEVICE_BANK3_PARAMETERS				    0x468
-#define MV64340_DEVICE_BOOT_BANK_PARAMETERS			    0x46c
-#define MV64340_DEVICE_INTERFACE_CONTROL                            0x4c0
-#define MV64340_DEVICE_INTERFACE_CROSS_BAR_CONTROL_LOW              0x4c8
-#define MV64340_DEVICE_INTERFACE_CROSS_BAR_CONTROL_HIGH             0x4cc
-#define MV64340_DEVICE_INTERFACE_CROSS_BAR_TIMEOUT                  0x4c4
-
-/****************************************/
-/* Device interrupt registers		*/
-/****************************************/
-
-#define MV64340_DEVICE_INTERRUPT_CAUSE				    0x4d0
-#define MV64340_DEVICE_INTERRUPT_MASK				    0x4d4
-#define MV64340_DEVICE_ERROR_ADDR				    0x4d8
-#define MV64340_DEVICE_ERROR_DATA   				    0x4dc
-#define MV64340_DEVICE_ERROR_PARITY     			    0x4e0
-
-/****************************************/
-/* Device debug registers   		*/
-/****************************************/
-
-#define MV64340_DEVICE_DEBUG_LOW     				    0x4e4
-#define MV64340_DEVICE_DEBUG_HIGH     				    0x4e8
-#define MV64340_RUNIT_MMASK                                         0x4f0
-
-/****************************************/
-/* PCI Slave Address Decoding registers */
-/****************************************/
-
-#define MV64340_PCI_0_CS_0_BANK_SIZE                                0xc08
-#define MV64340_PCI_1_CS_0_BANK_SIZE                                0xc88
-#define MV64340_PCI_0_CS_1_BANK_SIZE                                0xd08
-#define MV64340_PCI_1_CS_1_BANK_SIZE                                0xd88
-#define MV64340_PCI_0_CS_2_BANK_SIZE                                0xc0c
-#define MV64340_PCI_1_CS_2_BANK_SIZE                                0xc8c
-#define MV64340_PCI_0_CS_3_BANK_SIZE                                0xd0c
-#define MV64340_PCI_1_CS_3_BANK_SIZE                                0xd8c
-#define MV64340_PCI_0_DEVCS_0_BANK_SIZE                             0xc10
-#define MV64340_PCI_1_DEVCS_0_BANK_SIZE                             0xc90
-#define MV64340_PCI_0_DEVCS_1_BANK_SIZE                             0xd10
-#define MV64340_PCI_1_DEVCS_1_BANK_SIZE                             0xd90
-#define MV64340_PCI_0_DEVCS_2_BANK_SIZE                             0xd18
-#define MV64340_PCI_1_DEVCS_2_BANK_SIZE                             0xd98
-#define MV64340_PCI_0_DEVCS_3_BANK_SIZE                             0xc14
-#define MV64340_PCI_1_DEVCS_3_BANK_SIZE                             0xc94
-#define MV64340_PCI_0_DEVCS_BOOT_BANK_SIZE                          0xd14
-#define MV64340_PCI_1_DEVCS_BOOT_BANK_SIZE                          0xd94
-#define MV64340_PCI_0_P2P_MEM0_BAR_SIZE                             0xd1c
-#define MV64340_PCI_1_P2P_MEM0_BAR_SIZE                             0xd9c
-#define MV64340_PCI_0_P2P_MEM1_BAR_SIZE                             0xd20
-#define MV64340_PCI_1_P2P_MEM1_BAR_SIZE                             0xda0
-#define MV64340_PCI_0_P2P_I_O_BAR_SIZE                              0xd24
-#define MV64340_PCI_1_P2P_I_O_BAR_SIZE                              0xda4
-#define MV64340_PCI_0_CPU_BAR_SIZE                                  0xd28
-#define MV64340_PCI_1_CPU_BAR_SIZE                                  0xda8
-#define MV64340_PCI_0_INTERNAL_SRAM_BAR_SIZE                        0xe00
-#define MV64340_PCI_1_INTERNAL_SRAM_BAR_SIZE                        0xe80
-#define MV64340_PCI_0_EXPANSION_ROM_BAR_SIZE                        0xd2c
-#define MV64340_PCI_1_EXPANSION_ROM_BAR_SIZE                        0xd9c
-#define MV64340_PCI_0_BASE_ADDR_REG_ENABLE                          0xc3c
-#define MV64340_PCI_1_BASE_ADDR_REG_ENABLE                          0xcbc
-#define MV64340_PCI_0_CS_0_BASE_ADDR_REMAP			    0xc48
-#define MV64340_PCI_1_CS_0_BASE_ADDR_REMAP			    0xcc8
-#define MV64340_PCI_0_CS_1_BASE_ADDR_REMAP			    0xd48
-#define MV64340_PCI_1_CS_1_BASE_ADDR_REMAP			    0xdc8
-#define MV64340_PCI_0_CS_2_BASE_ADDR_REMAP			    0xc4c
-#define MV64340_PCI_1_CS_2_BASE_ADDR_REMAP			    0xccc
-#define MV64340_PCI_0_CS_3_BASE_ADDR_REMAP			    0xd4c
-#define MV64340_PCI_1_CS_3_BASE_ADDR_REMAP			    0xdcc
-#define MV64340_PCI_0_CS_0_BASE_HIGH_ADDR_REMAP			    0xF04
-#define MV64340_PCI_1_CS_0_BASE_HIGH_ADDR_REMAP			    0xF84
-#define MV64340_PCI_0_CS_1_BASE_HIGH_ADDR_REMAP			    0xF08
-#define MV64340_PCI_1_CS_1_BASE_HIGH_ADDR_REMAP			    0xF88
-#define MV64340_PCI_0_CS_2_BASE_HIGH_ADDR_REMAP			    0xF0C
-#define MV64340_PCI_1_CS_2_BASE_HIGH_ADDR_REMAP			    0xF8C
-#define MV64340_PCI_0_CS_3_BASE_HIGH_ADDR_REMAP			    0xF10
-#define MV64340_PCI_1_CS_3_BASE_HIGH_ADDR_REMAP			    0xF90
-#define MV64340_PCI_0_DEVCS_0_BASE_ADDR_REMAP			    0xc50
-#define MV64340_PCI_1_DEVCS_0_BASE_ADDR_REMAP			    0xcd0
-#define MV64340_PCI_0_DEVCS_1_BASE_ADDR_REMAP			    0xd50
-#define MV64340_PCI_1_DEVCS_1_BASE_ADDR_REMAP			    0xdd0
-#define MV64340_PCI_0_DEVCS_2_BASE_ADDR_REMAP			    0xd58
-#define MV64340_PCI_1_DEVCS_2_BASE_ADDR_REMAP			    0xdd8
-#define MV64340_PCI_0_DEVCS_3_BASE_ADDR_REMAP           	    0xc54
-#define MV64340_PCI_1_DEVCS_3_BASE_ADDR_REMAP           	    0xcd4
-#define MV64340_PCI_0_DEVCS_BOOTCS_BASE_ADDR_REMAP      	    0xd54
-#define MV64340_PCI_1_DEVCS_BOOTCS_BASE_ADDR_REMAP      	    0xdd4
-#define MV64340_PCI_0_P2P_MEM0_BASE_ADDR_REMAP_LOW                  0xd5c
-#define MV64340_PCI_1_P2P_MEM0_BASE_ADDR_REMAP_LOW                  0xddc
-#define MV64340_PCI_0_P2P_MEM0_BASE_ADDR_REMAP_HIGH                 0xd60
-#define MV64340_PCI_1_P2P_MEM0_BASE_ADDR_REMAP_HIGH                 0xde0
-#define MV64340_PCI_0_P2P_MEM1_BASE_ADDR_REMAP_LOW                  0xd64
-#define MV64340_PCI_1_P2P_MEM1_BASE_ADDR_REMAP_LOW                  0xde4
-#define MV64340_PCI_0_P2P_MEM1_BASE_ADDR_REMAP_HIGH                 0xd68
-#define MV64340_PCI_1_P2P_MEM1_BASE_ADDR_REMAP_HIGH                 0xde8
-#define MV64340_PCI_0_P2P_I_O_BASE_ADDR_REMAP                       0xd6c
-#define MV64340_PCI_1_P2P_I_O_BASE_ADDR_REMAP                       0xdec 
-#define MV64340_PCI_0_CPU_BASE_ADDR_REMAP_LOW                       0xd70
-#define MV64340_PCI_1_CPU_BASE_ADDR_REMAP_LOW                       0xdf0
-#define MV64340_PCI_0_CPU_BASE_ADDR_REMAP_HIGH                      0xd74
-#define MV64340_PCI_1_CPU_BASE_ADDR_REMAP_HIGH                      0xdf4
-#define MV64340_PCI_0_INTEGRATED_SRAM_BASE_ADDR_REMAP               0xf00
-#define MV64340_PCI_1_INTEGRATED_SRAM_BASE_ADDR_REMAP               0xf80
-#define MV64340_PCI_0_EXPANSION_ROM_BASE_ADDR_REMAP                 0xf38
-#define MV64340_PCI_1_EXPANSION_ROM_BASE_ADDR_REMAP                 0xfb8
-#define MV64340_PCI_0_ADDR_DECODE_CONTROL                           0xd3c
-#define MV64340_PCI_1_ADDR_DECODE_CONTROL                           0xdbc
-#define MV64340_PCI_0_HEADERS_RETARGET_CONTROL                      0xF40
-#define MV64340_PCI_1_HEADERS_RETARGET_CONTROL                      0xFc0
-#define MV64340_PCI_0_HEADERS_RETARGET_BASE                         0xF44
-#define MV64340_PCI_1_HEADERS_RETARGET_BASE                         0xFc4
-#define MV64340_PCI_0_HEADERS_RETARGET_HIGH                         0xF48
-#define MV64340_PCI_1_HEADERS_RETARGET_HIGH                         0xFc8
-
-/***********************************/
-/*   PCI Control Register Map      */
-/***********************************/
-
-#define MV64340_PCI_0_DLL_STATUS_AND_COMMAND                        0x1d20
-#define MV64340_PCI_1_DLL_STATUS_AND_COMMAND                        0x1da0
-#define MV64340_PCI_0_MPP_PADS_DRIVE_CONTROL                        0x1d1C
-#define MV64340_PCI_1_MPP_PADS_DRIVE_CONTROL                        0x1d9C
-#define MV64340_PCI_0_COMMAND			         	    0xc00
-#define MV64340_PCI_1_COMMAND					    0xc80
-#define MV64340_PCI_0_MODE                                          0xd00
-#define MV64340_PCI_1_MODE                                          0xd80
-#define MV64340_PCI_0_RETRY	        	 		    0xc04
-#define MV64340_PCI_1_RETRY				            0xc84
-#define MV64340_PCI_0_READ_BUFFER_DISCARD_TIMER                     0xd04
-#define MV64340_PCI_1_READ_BUFFER_DISCARD_TIMER                     0xd84
-#define MV64340_PCI_0_MSI_TRIGGER_TIMER                             0xc38
-#define MV64340_PCI_1_MSI_TRIGGER_TIMER                             0xcb8
-#define MV64340_PCI_0_ARBITER_CONTROL                               0x1d00
-#define MV64340_PCI_1_ARBITER_CONTROL                               0x1d80
-#define MV64340_PCI_0_CROSS_BAR_CONTROL_LOW                         0x1d08
-#define MV64340_PCI_1_CROSS_BAR_CONTROL_LOW                         0x1d88
-#define MV64340_PCI_0_CROSS_BAR_CONTROL_HIGH                        0x1d0c
-#define MV64340_PCI_1_CROSS_BAR_CONTROL_HIGH                        0x1d8c
-#define MV64340_PCI_0_CROSS_BAR_TIMEOUT                             0x1d04
-#define MV64340_PCI_1_CROSS_BAR_TIMEOUT                             0x1d84
-#define MV64340_PCI_0_SYNC_BARRIER_TRIGGER_REG                      0x1D18
-#define MV64340_PCI_1_SYNC_BARRIER_TRIGGER_REG                      0x1D98
-#define MV64340_PCI_0_SYNC_BARRIER_VIRTUAL_REG                      0x1d10
-#define MV64340_PCI_1_SYNC_BARRIER_VIRTUAL_REG                      0x1d90
-#define MV64340_PCI_0_P2P_CONFIG                                    0x1d14
-#define MV64340_PCI_1_P2P_CONFIG                                    0x1d94
-
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_0_LOW                     0x1e00
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_0_HIGH                    0x1e04
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_0                         0x1e08
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_1_LOW                     0x1e10
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_1_HIGH                    0x1e14
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_1                         0x1e18
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_2_LOW                     0x1e20
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_2_HIGH                    0x1e24
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_2                         0x1e28
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_3_LOW                     0x1e30
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_3_HIGH                    0x1e34
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_3                         0x1e38
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_4_LOW                     0x1e40
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_4_HIGH                    0x1e44
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_4                         0x1e48
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_5_LOW                     0x1e50
-#define MV64340_PCI_0_ACCESS_CONTROL_BASE_5_HIGH                    0x1e54
-#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_5                         0x1e58
-
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_0_LOW                     0x1e80
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_0_HIGH                    0x1e84
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_0                         0x1e88
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_1_LOW                     0x1e90
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_1_HIGH                    0x1e94
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_1                         0x1e98
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_2_LOW                     0x1ea0
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_2_HIGH                    0x1ea4
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_2                         0x1ea8
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_3_LOW                     0x1eb0
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_3_HIGH                    0x1eb4
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_3                         0x1eb8
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_4_LOW                     0x1ec0
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_4_HIGH                    0x1ec4
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_4                         0x1ec8
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_5_LOW                     0x1ed0
-#define MV64340_PCI_1_ACCESS_CONTROL_BASE_5_HIGH                    0x1ed4
-#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_5                         0x1ed8
-
-/****************************************/
-/*   PCI Configuration Access Registers */
-/****************************************/
-
-#define MV64340_PCI_0_CONFIG_ADDR 				    0xcf8
-#define MV64340_PCI_0_CONFIG_DATA_VIRTUAL_REG                       0xcfc
-#define MV64340_PCI_1_CONFIG_ADDR 				    0xc78
-#define MV64340_PCI_1_CONFIG_DATA_VIRTUAL_REG                       0xc7c
-#define MV64340_PCI_0_INTERRUPT_ACKNOWLEDGE_VIRTUAL_REG	            0xc34
-#define MV64340_PCI_1_INTERRUPT_ACKNOWLEDGE_VIRTUAL_REG	            0xcb4
-
-/****************************************/
-/*   PCI Error Report Registers         */
-/****************************************/
-
-#define MV64340_PCI_0_SERR_MASK					    0xc28
-#define MV64340_PCI_1_SERR_MASK					    0xca8
-#define MV64340_PCI_0_ERROR_ADDR_LOW                                0x1d40
-#define MV64340_PCI_1_ERROR_ADDR_LOW                                0x1dc0
-#define MV64340_PCI_0_ERROR_ADDR_HIGH                               0x1d44
-#define MV64340_PCI_1_ERROR_ADDR_HIGH                               0x1dc4
-#define MV64340_PCI_0_ERROR_ATTRIBUTE                               0x1d48
-#define MV64340_PCI_1_ERROR_ATTRIBUTE                               0x1dc8
-#define MV64340_PCI_0_ERROR_COMMAND                                 0x1d50
-#define MV64340_PCI_1_ERROR_COMMAND                                 0x1dd0
-#define MV64340_PCI_0_ERROR_CAUSE                                   0x1d58
-#define MV64340_PCI_1_ERROR_CAUSE                                   0x1dd8
-#define MV64340_PCI_0_ERROR_MASK                                    0x1d5c
-#define MV64340_PCI_1_ERROR_MASK                                    0x1ddc
-
-/****************************************/
-/*   PCI Debug Registers                */
-/****************************************/
-
-#define MV64340_PCI_0_MMASK                                         0X1D24
-#define MV64340_PCI_1_MMASK                                         0X1DA4
-
-/*********************************************/
-/* PCI Configuration, Function 0, Registers  */
-/*********************************************/
-
-#define MV64340_PCI_DEVICE_AND_VENDOR_ID 			    0x000
-#define MV64340_PCI_STATUS_AND_COMMAND				    0x004
-#define MV64340_PCI_CLASS_CODE_AND_REVISION_ID			    0x008
-#define MV64340_PCI_BIST_HEADER_TYPE_LATENCY_TIMER_CACHE_LINE 	    0x00C
-
-#define MV64340_PCI_SCS_0_BASE_ADDR_LOW   	      		    0x010
-#define MV64340_PCI_SCS_0_BASE_ADDR_HIGH   		            0x014
-#define MV64340_PCI_SCS_1_BASE_ADDR_LOW  	     	            0x018
-#define MV64340_PCI_SCS_1_BASE_ADDR_HIGH 		            0x01C
-#define MV64340_PCI_INTERNAL_REG_MEM_MAPPED_BASE_ADDR_LOW      	    0x020
-#define MV64340_PCI_INTERNAL_REG_MEM_MAPPED_BASE_ADDR_HIGH     	    0x024
-#define MV64340_PCI_SUBSYSTEM_ID_AND_SUBSYSTEM_VENDOR_ID	    0x02c
-#define MV64340_PCI_EXPANSION_ROM_BASE_ADDR_REG	                    0x030
-#define MV64340_PCI_CAPABILTY_LIST_POINTER                          0x034
-#define MV64340_PCI_INTERRUPT_PIN_AND_LINE 			    0x03C
-       /* capability list */
-#define MV64340_PCI_POWER_MANAGEMENT_CAPABILITY                     0x040
-#define MV64340_PCI_POWER_MANAGEMENT_STATUS_AND_CONTROL             0x044
-#define MV64340_PCI_VPD_ADDR                                        0x048
-#define MV64340_PCI_VPD_DATA                                        0x04c
-#define MV64340_PCI_MSI_MESSAGE_CONTROL                             0x050
-#define MV64340_PCI_MSI_MESSAGE_ADDR                                0x054
-#define MV64340_PCI_MSI_MESSAGE_UPPER_ADDR                          0x058
-#define MV64340_PCI_MSI_MESSAGE_DATA                                0x05c
-#define MV64340_PCI_X_COMMAND                                       0x060
-#define MV64340_PCI_X_STATUS                                        0x064
-#define MV64340_PCI_COMPACT_PCI_HOT_SWAP                            0x068
-
-/***********************************************/
-/*   PCI Configuration, Function 1, Registers  */
-/***********************************************/
-
-#define MV64340_PCI_SCS_2_BASE_ADDR_LOW   			    0x110
-#define MV64340_PCI_SCS_2_BASE_ADDR_HIGH			    0x114
-#define MV64340_PCI_SCS_3_BASE_ADDR_LOW 			    0x118
-#define MV64340_PCI_SCS_3_BASE_ADDR_HIGH			    0x11c
-#define MV64340_PCI_INTERNAL_SRAM_BASE_ADDR_LOW          	    0x120
-#define MV64340_PCI_INTERNAL_SRAM_BASE_ADDR_HIGH         	    0x124
-
-/***********************************************/
-/*  PCI Configuration, Function 2, Registers   */
-/***********************************************/
-
-#define MV64340_PCI_DEVCS_0_BASE_ADDR_LOW	    		    0x210
-#define MV64340_PCI_DEVCS_0_BASE_ADDR_HIGH 			    0x214
-#define MV64340_PCI_DEVCS_1_BASE_ADDR_LOW 			    0x218
-#define MV64340_PCI_DEVCS_1_BASE_ADDR_HIGH      		    0x21c
-#define MV64340_PCI_DEVCS_2_BASE_ADDR_LOW 			    0x220
-#define MV64340_PCI_DEVCS_2_BASE_ADDR_HIGH      		    0x224
-
-/***********************************************/
-/*  PCI Configuration, Function 3, Registers   */
-/***********************************************/
-
-#define MV64340_PCI_DEVCS_3_BASE_ADDR_LOW	    		    0x310
-#define MV64340_PCI_DEVCS_3_BASE_ADDR_HIGH 			    0x314
-#define MV64340_PCI_BOOT_CS_BASE_ADDR_LOW			    0x318
-#define MV64340_PCI_BOOT_CS_BASE_ADDR_HIGH      		    0x31c
-#define MV64340_PCI_CPU_BASE_ADDR_LOW 				    0x220
-#define MV64340_PCI_CPU_BASE_ADDR_HIGH      			    0x224
-
-/***********************************************/
-/*  PCI Configuration, Function 4, Registers   */
-/***********************************************/
-
-#define MV64340_PCI_P2P_MEM0_BASE_ADDR_LOW  			    0x410
-#define MV64340_PCI_P2P_MEM0_BASE_ADDR_HIGH 			    0x414
-#define MV64340_PCI_P2P_MEM1_BASE_ADDR_LOW   			    0x418
-#define MV64340_PCI_P2P_MEM1_BASE_ADDR_HIGH 			    0x41c
-#define MV64340_PCI_P2P_I_O_BASE_ADDR                 	            0x420
-#define MV64340_PCI_INTERNAL_REGS_I_O_MAPPED_BASE_ADDR              0x424
-
-/****************************************/
-/* Messaging Unit Registers (I20)   	*/
-/****************************************/
-
-#define MV64340_I2O_INBOUND_MESSAGE_REG0_PCI_0_SIDE		    0x010
-#define MV64340_I2O_INBOUND_MESSAGE_REG1_PCI_0_SIDE  		    0x014
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_PCI_0_SIDE 		    0x018
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_PCI_0_SIDE  		    0x01C
-#define MV64340_I2O_INBOUND_DOORBELL_REG_PCI_0_SIDE  		    0x020
-#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_PCI_0_SIDE          0x024
-#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_PCI_0_SIDE	    0x028
-#define MV64340_I2O_OUTBOUND_DOORBELL_REG_PCI_0_SIDE 		    0x02C
-#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_PCI_0_SIDE         0x030
-#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_PCI_0_SIDE          0x034
-#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_0_SIDE       0x040
-#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_0_SIDE      0x044
-#define MV64340_I2O_QUEUE_CONTROL_REG_PCI_0_SIDE 		    0x050
-#define MV64340_I2O_QUEUE_BASE_ADDR_REG_PCI_0_SIDE 		    0x054
-#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_PCI_0_SIDE        0x060
-#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_PCI_0_SIDE        0x064
-#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_PCI_0_SIDE        0x068
-#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_PCI_0_SIDE        0x06C
-#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_PCI_0_SIDE       0x070
-#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_PCI_0_SIDE       0x074
-#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_PCI_0_SIDE       0x0F8
-#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_PCI_0_SIDE       0x0FC
-
-#define MV64340_I2O_INBOUND_MESSAGE_REG0_PCI_1_SIDE		    0x090
-#define MV64340_I2O_INBOUND_MESSAGE_REG1_PCI_1_SIDE  		    0x094
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_PCI_1_SIDE 		    0x098
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_PCI_1_SIDE  		    0x09C
-#define MV64340_I2O_INBOUND_DOORBELL_REG_PCI_1_SIDE  		    0x0A0
-#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_PCI_1_SIDE          0x0A4
-#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_PCI_1_SIDE	    0x0A8
-#define MV64340_I2O_OUTBOUND_DOORBELL_REG_PCI_1_SIDE 		    0x0AC
-#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_PCI_1_SIDE         0x0B0
-#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_PCI_1_SIDE          0x0B4
-#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_1_SIDE       0x0C0
-#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_1_SIDE      0x0C4
-#define MV64340_I2O_QUEUE_CONTROL_REG_PCI_1_SIDE 		    0x0D0
-#define MV64340_I2O_QUEUE_BASE_ADDR_REG_PCI_1_SIDE 		    0x0D4
-#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_PCI_1_SIDE        0x0E0
-#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_PCI_1_SIDE        0x0E4
-#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_PCI_1_SIDE        0x0E8
-#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_PCI_1_SIDE        0x0EC
-#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_PCI_1_SIDE       0x0F0
-#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_PCI_1_SIDE       0x0F4
-#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_PCI_1_SIDE       0x078
-#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_PCI_1_SIDE       0x07C
-
-#define MV64340_I2O_INBOUND_MESSAGE_REG0_CPU0_SIDE		    0x1C10
-#define MV64340_I2O_INBOUND_MESSAGE_REG1_CPU0_SIDE  		    0x1C14
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_CPU0_SIDE 		    0x1C18
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_CPU0_SIDE  		    0x1C1C
-#define MV64340_I2O_INBOUND_DOORBELL_REG_CPU0_SIDE  		    0x1C20
-#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_CPU0_SIDE  	    0x1C24
-#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_CPU0_SIDE	    0x1C28
-#define MV64340_I2O_OUTBOUND_DOORBELL_REG_CPU0_SIDE 		    0x1C2C
-#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_CPU0_SIDE          0x1C30
-#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_CPU0_SIDE           0x1C34
-#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_CPU0_SIDE        0x1C40
-#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_CPU0_SIDE       0x1C44
-#define MV64340_I2O_QUEUE_CONTROL_REG_CPU0_SIDE 		    0x1C50
-#define MV64340_I2O_QUEUE_BASE_ADDR_REG_CPU0_SIDE 		    0x1C54
-#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_CPU0_SIDE         0x1C60
-#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_CPU0_SIDE         0x1C64
-#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_CPU0_SIDE         0x1C68
-#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_CPU0_SIDE         0x1C6C
-#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_CPU0_SIDE        0x1C70
-#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_CPU0_SIDE        0x1C74
-#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_CPU0_SIDE        0x1CF8
-#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_CPU0_SIDE        0x1CFC
-#define MV64340_I2O_INBOUND_MESSAGE_REG0_CPU1_SIDE		    0x1C90
-#define MV64340_I2O_INBOUND_MESSAGE_REG1_CPU1_SIDE  		    0x1C94
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_CPU1_SIDE 		    0x1C98
-#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_CPU1_SIDE  		    0x1C9C
-#define MV64340_I2O_INBOUND_DOORBELL_REG_CPU1_SIDE  		    0x1CA0
-#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_CPU1_SIDE  	    0x1CA4
-#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_CPU1_SIDE	    0x1CA8
-#define MV64340_I2O_OUTBOUND_DOORBELL_REG_CPU1_SIDE 		    0x1CAC
-#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_CPU1_SIDE          0x1CB0
-#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_CPU1_SIDE           0x1CB4
-#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_CPU1_SIDE        0x1CC0
-#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_CPU1_SIDE       0x1CC4
-#define MV64340_I2O_QUEUE_CONTROL_REG_CPU1_SIDE 		    0x1CD0
-#define MV64340_I2O_QUEUE_BASE_ADDR_REG_CPU1_SIDE 		    0x1CD4
-#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_CPU1_SIDE         0x1CE0
-#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_CPU1_SIDE         0x1CE4
-#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_CPU1_SIDE         0x1CE8
-#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_CPU1_SIDE         0x1CEC
-#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_CPU1_SIDE        0x1CF0
-#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_CPU1_SIDE        0x1CF4
-#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_CPU1_SIDE        0x1C78
-#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_CPU1_SIDE        0x1C7C
-
-/****************************************/
-/*        Ethernet Unit Registers  		*/
-/****************************************/
-
-/*******************************************/
-/*          CUNIT  Registers               */
-/*******************************************/
-
-         /* Address Decoding Register Map */
-           
-#define MV64340_CUNIT_BASE_ADDR_REG0                                0xf200
-#define MV64340_CUNIT_BASE_ADDR_REG1                                0xf208
-#define MV64340_CUNIT_BASE_ADDR_REG2                                0xf210
-#define MV64340_CUNIT_BASE_ADDR_REG3                                0xf218
-#define MV64340_CUNIT_SIZE0                                         0xf204
-#define MV64340_CUNIT_SIZE1                                         0xf20c
-#define MV64340_CUNIT_SIZE2                                         0xf214
-#define MV64340_CUNIT_SIZE3                                         0xf21c
-#define MV64340_CUNIT_HIGH_ADDR_REMAP_REG0                          0xf240
-#define MV64340_CUNIT_HIGH_ADDR_REMAP_REG1                          0xf244
-#define MV64340_CUNIT_BASE_ADDR_ENABLE_REG                          0xf250
-#define MV64340_MPSC0_ACCESS_PROTECTION_REG                         0xf254
-#define MV64340_MPSC1_ACCESS_PROTECTION_REG                         0xf258
-#define MV64340_CUNIT_INTERNAL_SPACE_BASE_ADDR_REG                  0xf25C
-
-        /*  Error Report Registers  */
-
-#define MV64340_CUNIT_INTERRUPT_CAUSE_REG                           0xf310
-#define MV64340_CUNIT_INTERRUPT_MASK_REG                            0xf314
-#define MV64340_CUNIT_ERROR_ADDR                                    0xf318
-
-        /*  Cunit Control Registers */
-
-#define MV64340_CUNIT_ARBITER_CONTROL_REG                           0xf300
-#define MV64340_CUNIT_CONFIG_REG                                    0xb40c
-#define MV64340_CUNIT_CRROSBAR_TIMEOUT_REG                          0xf304
-
-        /*  Cunit Debug Registers   */
-
-#define MV64340_CUNIT_DEBUG_LOW                                     0xf340
-#define MV64340_CUNIT_DEBUG_HIGH                                    0xf344
-#define MV64340_CUNIT_MMASK                                         0xf380
-
-        /*  MPSCs Clocks Routing Registers  */
-
-#define MV64340_MPSC_ROUTING_REG                                    0xb400
-#define MV64340_MPSC_RX_CLOCK_ROUTING_REG                           0xb404
-#define MV64340_MPSC_TX_CLOCK_ROUTING_REG                           0xb408
-
-        /*  MPSCs Interrupts Registers    */
-
-#define MV64340_MPSC_CAUSE_REG(port)                               (0xb804 + (port<<3))
-#define MV64340_MPSC_MASK_REG(port)                                (0xb884 + (port<<3))
- 
-#define MV64340_MPSC_MAIN_CONFIG_LOW(port)                         (0x8000 + (port<<12))
-#define MV64340_MPSC_MAIN_CONFIG_HIGH(port)                        (0x8004 + (port<<12))    
-#define MV64340_MPSC_PROTOCOL_CONFIG(port)                         (0x8008 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG1(port)                            (0x800c + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG2(port)                            (0x8010 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG3(port)                            (0x8014 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG4(port)                            (0x8018 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG5(port)                            (0x801c + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG6(port)                            (0x8020 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG7(port)                            (0x8024 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG8(port)                            (0x8028 + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG9(port)                            (0x802c + (port<<12))    
-#define MV64340_MPSC_CHANNEL_REG10(port)                           (0x8030 + (port<<12))    
-        
-        /*  MPSC0 Registers      */
-
-
-/***************************************/
-/*          SDMA Registers             */
-/***************************************/
-
-#define MV64340_SDMA_CONFIG_REG(channel)                        (0x4000 + (channel<<13))        
-#define MV64340_SDMA_COMMAND_REG(channel)                       (0x4008 + (channel<<13))        
-#define MV64340_SDMA_CURRENT_RX_DESCRIPTOR_POINTER(channel)     (0x4810 + (channel<<13))        
-#define MV64340_SDMA_CURRENT_TX_DESCRIPTOR_POINTER(channel)     (0x4c10 + (channel<<13))        
-#define MV64340_SDMA_FIRST_TX_DESCRIPTOR_POINTER(channel)       (0x4c14 + (channel<<13)) 
-
-#define MV64340_SDMA_CAUSE_REG                                      0xb800
-#define MV64340_SDMA_MASK_REG                                       0xb880
-         
-/* BRG Interrupts */
-
-#define MV64340_BRG_CONFIG_REG(brg)                              (0xb200 + (brg<<3))
-#define MV64340_BRG_BAUDE_TUNING_REG(brg)                        (0xb208 + (brg<<3))
-#define MV64340_BRG_CAUSE_REG                                       0xb834
-#define MV64340_BRG_MASK_REG                                        0xb8b4
-
-/****************************************/
-/* DMA Channel Control			*/
-/****************************************/
-
-#define MV64340_DMA_CHANNEL0_CONTROL 				    0x840
-#define MV64340_DMA_CHANNEL0_CONTROL_HIGH			    0x880
-#define MV64340_DMA_CHANNEL1_CONTROL 				    0x844
-#define MV64340_DMA_CHANNEL1_CONTROL_HIGH			    0x884
-#define MV64340_DMA_CHANNEL2_CONTROL 				    0x848
-#define MV64340_DMA_CHANNEL2_CONTROL_HIGH			    0x888
-#define MV64340_DMA_CHANNEL3_CONTROL 				    0x84C
-#define MV64340_DMA_CHANNEL3_CONTROL_HIGH			    0x88C
-
-
-/****************************************/
-/*           IDMA Registers             */
-/****************************************/
-
-#define MV64340_DMA_CHANNEL0_BYTE_COUNT                             0x800
-#define MV64340_DMA_CHANNEL1_BYTE_COUNT                             0x804
-#define MV64340_DMA_CHANNEL2_BYTE_COUNT                             0x808
-#define MV64340_DMA_CHANNEL3_BYTE_COUNT                             0x80C
-#define MV64340_DMA_CHANNEL0_SOURCE_ADDR                            0x810
-#define MV64340_DMA_CHANNEL1_SOURCE_ADDR                            0x814
-#define MV64340_DMA_CHANNEL2_SOURCE_ADDR                            0x818
-#define MV64340_DMA_CHANNEL3_SOURCE_ADDR                            0x81c
-#define MV64340_DMA_CHANNEL0_DESTINATION_ADDR                       0x820
-#define MV64340_DMA_CHANNEL1_DESTINATION_ADDR                       0x824
-#define MV64340_DMA_CHANNEL2_DESTINATION_ADDR                       0x828
-#define MV64340_DMA_CHANNEL3_DESTINATION_ADDR                       0x82C
-#define MV64340_DMA_CHANNEL0_NEXT_DESCRIPTOR_POINTER                0x830
-#define MV64340_DMA_CHANNEL1_NEXT_DESCRIPTOR_POINTER                0x834
-#define MV64340_DMA_CHANNEL2_NEXT_DESCRIPTOR_POINTER                0x838
-#define MV64340_DMA_CHANNEL3_NEXT_DESCRIPTOR_POINTER                0x83C
-#define MV64340_DMA_CHANNEL0_CURRENT_DESCRIPTOR_POINTER             0x870
-#define MV64340_DMA_CHANNEL1_CURRENT_DESCRIPTOR_POINTER             0x874
-#define MV64340_DMA_CHANNEL2_CURRENT_DESCRIPTOR_POINTER             0x878
-#define MV64340_DMA_CHANNEL3_CURRENT_DESCRIPTOR_POINTER             0x87C
-
- /*  IDMA Address Decoding Base Address Registers  */
- 
-#define MV64340_DMA_BASE_ADDR_REG0                                  0xa00
-#define MV64340_DMA_BASE_ADDR_REG1                                  0xa08
-#define MV64340_DMA_BASE_ADDR_REG2                                  0xa10
-#define MV64340_DMA_BASE_ADDR_REG3                                  0xa18
-#define MV64340_DMA_BASE_ADDR_REG4                                  0xa20
-#define MV64340_DMA_BASE_ADDR_REG5                                  0xa28
-#define MV64340_DMA_BASE_ADDR_REG6                                  0xa30
-#define MV64340_DMA_BASE_ADDR_REG7                                  0xa38
- 
- /*  IDMA Address Decoding Size Address Register   */
- 
-#define MV64340_DMA_SIZE_REG0                                       0xa04
-#define MV64340_DMA_SIZE_REG1                                       0xa0c
-#define MV64340_DMA_SIZE_REG2                                       0xa14
-#define MV64340_DMA_SIZE_REG3                                       0xa1c
-#define MV64340_DMA_SIZE_REG4                                       0xa24
-#define MV64340_DMA_SIZE_REG5                                       0xa2c
-#define MV64340_DMA_SIZE_REG6                                       0xa34
-#define MV64340_DMA_SIZE_REG7                                       0xa3C
-
- /* IDMA Address Decoding High Address Remap and Access 
-                  Protection Registers                    */
-                  
-#define MV64340_DMA_HIGH_ADDR_REMAP_REG0                            0xa60
-#define MV64340_DMA_HIGH_ADDR_REMAP_REG1                            0xa64
-#define MV64340_DMA_HIGH_ADDR_REMAP_REG2                            0xa68
-#define MV64340_DMA_HIGH_ADDR_REMAP_REG3                            0xa6C
-#define MV64340_DMA_BASE_ADDR_ENABLE_REG                            0xa80
-#define MV64340_DMA_CHANNEL0_ACCESS_PROTECTION_REG                  0xa70
-#define MV64340_DMA_CHANNEL1_ACCESS_PROTECTION_REG                  0xa74
-#define MV64340_DMA_CHANNEL2_ACCESS_PROTECTION_REG                  0xa78
-#define MV64340_DMA_CHANNEL3_ACCESS_PROTECTION_REG                  0xa7c
-#define MV64340_DMA_ARBITER_CONTROL                                 0x860
-#define MV64340_DMA_CROSS_BAR_TIMEOUT                               0x8d0
-
- /*  IDMA Headers Retarget Registers   */
-
-#define MV64340_DMA_HEADERS_RETARGET_CONTROL                        0xa84
-#define MV64340_DMA_HEADERS_RETARGET_BASE                           0xa88
-
- /*  IDMA Interrupt Register  */
-
-#define MV64340_DMA_INTERRUPT_CAUSE_REG                             0x8c0
-#define MV64340_DMA_INTERRUPT_CAUSE_MASK                            0x8c4
-#define MV64340_DMA_ERROR_ADDR                                      0x8c8
-#define MV64340_DMA_ERROR_SELECT                                    0x8cc
-
- /*  IDMA Debug Register ( for internal use )    */
-
-#define MV64340_DMA_DEBUG_LOW                                       0x8e0
-#define MV64340_DMA_DEBUG_HIGH                                      0x8e4
-#define MV64340_DMA_SPARE                                           0xA8C
-
-/****************************************/
-/* Timer_Counter 			*/
-/****************************************/
-
-#define MV64340_TIMER_COUNTER0					    0x850
-#define MV64340_TIMER_COUNTER1					    0x854
-#define MV64340_TIMER_COUNTER2					    0x858
-#define MV64340_TIMER_COUNTER3					    0x85C
-#define MV64340_TIMER_COUNTER_0_3_CONTROL			    0x864
-#define MV64340_TIMER_COUNTER_0_3_INTERRUPT_CAUSE		    0x868
-#define MV64340_TIMER_COUNTER_0_3_INTERRUPT_MASK      		    0x86c
-
-/****************************************/
-/*         Watchdog registers  	        */
-/****************************************/
-
-#define MV64340_WATCHDOG_CONFIG_REG                                 0xb410
-#define MV64340_WATCHDOG_VALUE_REG                                  0xb414
-
-/****************************************/
-/* I2C Registers                        */
-/****************************************/
-
-#define MV64XXX_I2C_OFFSET                                          0xc000
-#define MV64XXX_I2C_REG_BLOCK_SIZE                                  0x0020
-
-/****************************************/
-/* GPP Interface Registers              */
-/****************************************/
-
-#define MV64340_GPP_IO_CONTROL                                      0xf100
-#define MV64340_GPP_LEVEL_CONTROL                                   0xf110
-#define MV64340_GPP_VALUE                                           0xf104
-#define MV64340_GPP_INTERRUPT_CAUSE                                 0xf108
-#define MV64340_GPP_INTERRUPT_MASK0                                 0xf10c
-#define MV64340_GPP_INTERRUPT_MASK1                                 0xf114
-#define MV64340_GPP_VALUE_SET                                       0xf118
-#define MV64340_GPP_VALUE_CLEAR                                     0xf11c
-
-/****************************************/
-/* Interrupt Controller Registers       */
-/****************************************/
-
-/****************************************/
-/* Interrupts	  			*/
-/****************************************/
-
-#define MV64340_MAIN_INTERRUPT_CAUSE_LOW                            0x004
-#define MV64340_MAIN_INTERRUPT_CAUSE_HIGH                           0x00c
-#define MV64340_CPU_INTERRUPT0_MASK_LOW                             0x014
-#define MV64340_CPU_INTERRUPT0_MASK_HIGH                            0x01c
-#define MV64340_CPU_INTERRUPT0_SELECT_CAUSE                         0x024
-#define MV64340_CPU_INTERRUPT1_MASK_LOW                             0x034
-#define MV64340_CPU_INTERRUPT1_MASK_HIGH                            0x03c
-#define MV64340_CPU_INTERRUPT1_SELECT_CAUSE                         0x044
-#define MV64340_INTERRUPT0_MASK_0_LOW                               0x054
-#define MV64340_INTERRUPT0_MASK_0_HIGH                              0x05c
-#define MV64340_INTERRUPT0_SELECT_CAUSE                             0x064
-#define MV64340_INTERRUPT1_MASK_0_LOW                               0x074
-#define MV64340_INTERRUPT1_MASK_0_HIGH                              0x07c
-#define MV64340_INTERRUPT1_SELECT_CAUSE                             0x084
-
-/****************************************/
-/*      MPP Interface Registers         */
-/****************************************/
-
-#define MV64340_MPP_CONTROL0                                        0xf000
-#define MV64340_MPP_CONTROL1                                        0xf004
-#define MV64340_MPP_CONTROL2                                        0xf008
-#define MV64340_MPP_CONTROL3                                        0xf00c
-
-/****************************************/
-/*    Serial Initialization registers   */
-/****************************************/
-
-#define MV64340_SERIAL_INIT_LAST_DATA                               0xf324
-#define MV64340_SERIAL_INIT_CONTROL                                 0xf328
-#define MV64340_SERIAL_INIT_STATUS                                  0xf32c
-
-extern void mv64340_irq_init(unsigned int base);
-
-#endif /* __ASM_MV643XX_H */
-- 
cgit v1.2.3


From 4208c562a27899212e8046080555e0f204e0579a Mon Sep 17 00:00:00 2001
From: Kanchan Joshi <joshi.k@samsung.com>
Date: Tue, 17 Sep 2024 10:24:57 +0530
Subject: block: remove bogus union

The union around bi_integrity field is pointless.
Remove it.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Link: https://lore.kernel.org/r/20240917045457.429698-1-joshi.k@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 36ed96133217..6d3a0fff2a1d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -248,11 +248,9 @@ struct bio {
 	struct bio_crypt_ctx	*bi_crypt_context;
 #endif
 
-	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-		struct bio_integrity_payload *bi_integrity; /* data integrity */
+	struct bio_integrity_payload *bi_integrity; /* data integrity */
 #endif
-	};
 
 	unsigned short		bi_vcnt;	/* how many bio_vec's */
 
-- 
cgit v1.2.3


From 6857be5fecaebd9773ff27b6d29b6fff3b1abbce Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:35 -0400
Subject: mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud

Patch series "mm: Support huge pfnmaps", v2.

Overview
========

This series implements huge pfnmaps support for mm in general.  Huge
pfnmap allows e.g.  VM_PFNMAP vmas to map in either PMD or PUD levels,
similar to what we do with dax / thp / hugetlb so far to benefit from TLB
hits.  Now we extend that idea to PFN mappings, e.g.  PCI MMIO bars where
it can grow as large as 8GB or even bigger.

Currently, only x86_64 (1G+2M) and arm64 (2M) are supported.  The last
patch (from Alex Williamson) will be the first user of huge pfnmap, so as
to enable vfio-pci driver to fault in huge pfn mappings.

Implementation
==============

In reality, it's relatively simple to add such support comparing to many
other types of mappings, because of PFNMAP's specialties when there's no
vmemmap backing it, so that most of the kernel routines on huge mappings
should simply already fail for them, like GUPs or old-school follow_page()
(which is recently rewritten to be folio_walk* APIs by David).

One trick here is that we're still unmature on PUDs in generic paths here
and there, as DAX is so far the only user.  This patchset will add the 2nd
user of it.  Hugetlb can be a 3rd user if the hugetlb unification work can
go on smoothly, but to be discussed later.

The other trick is how to allow gup-fast working for such huge mappings
even if there's no direct sign of knowing whether it's a normal page or
MMIO mapping.  This series chose to keep the pte_special solution, so that
it reuses similar idea on setting a special bit to pfnmap PMDs/PUDs so
that gup-fast will be able to identify them and fail properly.

Along the way, we'll also notice that the major pgtable pfn walker, aka,
follow_pte(), will need to retire soon due to the fact that it only works
with ptes.  A new set of simple API is introduced (follow_pfnmap* API) to
be able to do whatever follow_pte() can already do, plus that it can also
process huge pfnmaps now.  Half of this series is about that and
converting all existing pfnmap walkers to use the new API properly.
Hopefully the new API also looks better to avoid exposing e.g.  pgtable
lock details into the callers, so that it can be used in an even more
straightforward way.

Here, three more options will be introduced and involved in huge pfnmap:

  - ARCH_SUPPORTS_HUGE_PFNMAP

    Arch developers will need to select this option when huge pfnmap is
    supported in arch's Kconfig.  After this patchset applied, both x86_64
    and arm64 will start to enable it by default.

  - ARCH_SUPPORTS_PMD_PFNMAP / ARCH_SUPPORTS_PUD_PFNMAP

    These options are for driver developers to identify whether current
    arch / config supports huge pfnmaps, making decision on whether it can
    use the huge pfnmap APIs to inject them.  One can refer to the last
    vfio-pci patch from Alex on the use of them properly in a device
    driver.

So after the whole set applied, and if one would enable some dynamic debug
lines in vfio-pci core files, we should observe things like:

  vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x0: 0x100
  vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x200: 0x100
  vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x400: 0x100

In this specific case, it says that vfio-pci faults in PMDs properly for a
few BAR0 offsets.

Patch Layout
============

Patch 1:         Introduce the new options mentioned above for huge PFNMAPs
Patch 2:         A tiny cleanup
Patch 3-8:       Preparation patches for huge pfnmap (include introduce
                 special bit for pmd/pud)
Patch 9-16:      Introduce follow_pfnmap*() API, use it everywhere, and
                 then drop follow_pte() API
Patch 17:        Add huge pfnmap support for x86_64
Patch 18:        Add huge pfnmap support for arm64
Patch 19:        Add vfio-pci support for all kinds of huge pfnmaps (Alex)

TODO
====

More architectures / More page sizes
------------------------------------

Currently only x86_64 (2M+1G) and arm64 (2M) are supported.  There seems
to have plan to support arm64 1G later on top of this series [2].

Any arch will need to first support THP / THP_1G, then provide a special
bit in pmds/puds to support huge pfnmaps.

remap_pfn_range() support
-------------------------

Currently, remap_pfn_range() still only maps PTEs.  With the new option,
remap_pfn_range() can logically start to inject either PMDs or PUDs when
the alignment requirements match on the VAs.

When the support is there, it should be able to silently benefit all
drivers that is using remap_pfn_range() in its mmap() handler on better
TLB hit rate and overall faster MMIO accesses similar to processor on
hugepages.

More driver support
-------------------

VFIO is so far the only consumer for the huge pfnmaps after this series
applied.  Besides above remap_pfn_range() generic optimization, device
driver can also try to optimize its mmap() on a better VA alignment for
either PMD/PUD sizes.  This may, iiuc, normally require userspace changes,
as the driver doesn't normally decide the VA to map a bar.  But I don't
think I know all the drivers to know the full picture.

Credits all go to Alex on help testing the GPU/NIC use cases above.

[0] https://lore.kernel.org/r/73ad9540-3fb8-4154-9a4f-30a0a2b03d41@lucifer.local
[1] https://lore.kernel.org/r/20240807194812.819412-1-peterx@redhat.com
[2] https://lore.kernel.org/r/498e0731-81a4-4f75-95b4-a8ad0bcc7665@huawei.com


This patch (of 19):

This patch introduces the option to introduce special pte bit into
pmd/puds.  Archs can start to define pmd_special / pud_special when
supported by selecting the new option.  Per-arch support will be added
later.

Before that, create fallbacks for these helpers so that they are always
available.

Link: https://lkml.kernel.org/r/20240826204353.2228736-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20240826204353.2228736-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 39f6faace590..c2307a2d275d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2643,6 +2643,30 @@ static inline pte_t pte_mkspecial(pte_t pte)
 }
 #endif
 
+#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+	return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd;
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+	return false;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud;
+}
+#endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
 #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
-- 
cgit v1.2.3


From ef713ec3a566d3e5e011c5d6201eb661ebf94c1f Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:36 -0400
Subject: mm: drop is_huge_zero_pud()

It constantly returns false since 2017.  One assertion is added in 2019 but
it should never have triggered, IOW it means what is checked should be
asserted instead.

If it didn't exist for 7 years maybe it's good idea to remove it and only
add it when it comes.

Link: https://lkml.kernel.org/r/20240826204353.2228736-3-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e3896ebf0f94..ffca706bac81 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -433,11 +433,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
-static inline bool is_huge_zero_pud(pud_t pud)
-{
-	return false;
-}
-
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
 
@@ -578,11 +573,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return false;
 }
 
-static inline bool is_huge_zero_pud(pud_t pud)
-{
-	return false;
-}
-
 static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
 	return;
-- 
cgit v1.2.3


From 5dd40721f147e83733ad34848330913cb633046e Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:38 -0400
Subject: mm: allow THP orders for PFNMAPs

This enables PFNMAPs to be mapped at either pmd/pud layers.  Generalize the
dax case into vma_is_special_huge() so as to cover both.  Meanwhile, rename
the macro to THP_ORDERS_ALL_SPECIAL.

Link: https://lkml.kernel.org/r/20240826204353.2228736-5-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ffca706bac81..0b0539f4ee1a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -76,9 +76,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 /*
  * Mask of all large folio orders supported for file THP. Folios in a DAX
  * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
- * it.
+ * it.  Same to PFNMAPs where there's neither page* nor pagecache.
  */
-#define THP_ORDERS_ALL_FILE_DAX		\
+#define THP_ORDERS_ALL_SPECIAL		\
 	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
 #define THP_ORDERS_ALL_FILE_DEFAULT	\
 	((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
@@ -87,7 +87,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
  * Mask of all large folio orders supported for THP.
  */
 #define THP_ORDERS_ALL	\
-	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
+	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
 
 #define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
 #define TVA_IN_PF		(1 << 1)	/* Page fault handler */
-- 
cgit v1.2.3


From 0515e022e167cfacf1fee092eb93aa9514e23c0a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:42 -0400
Subject: mm: always define pxx_pgprot()

There're:

  - 8 archs (arc, arm64, include, mips, powerpc, s390, sh, x86) that
  support pte_pgprot().

  - 2 archs (x86, sparc) that support pmd_pgprot().

  - 1 arch (x86) that support pud_pgprot().

Always define them to be used in generic code, and then we don't need to
fiddle with "#ifdef"s when doing so.

Link: https://lkml.kernel.org/r/20240826204353.2228736-9-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 780f3b439d98..e8b2ac6bd2ae 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1956,6 +1956,18 @@ typedef unsigned int pgtbl_mod_mask;
 #define MAX_PTRS_PER_P4D PTRS_PER_P4D
 #endif
 
+#ifndef pte_pgprot
+#define pte_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pmd_pgprot
+#define pmd_pgprot(x) ((pgprot_t) {0})
+#endif
+
+#ifndef pud_pgprot
+#define pud_pgprot(x) ((pgprot_t) {0})
+#endif
+
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
-- 
cgit v1.2.3


From 6da8e9634bb7e3fdad9ae0e4db873a05036c4343 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:43 -0400
Subject: mm: new follow_pfnmap API

Introduce a pair of APIs to follow pfn mappings to get entry information.
It's very similar to what follow_pte() does before, but different in that
it recognizes huge pfn mappings.

Link: https://lkml.kernel.org/r/20240826204353.2228736-10-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c2307a2d275d..62bd89414897 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2373,6 +2373,37 @@ int follow_pte(struct vm_area_struct *vma, unsigned long address,
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
+struct follow_pfnmap_args {
+	/**
+	 * Inputs:
+	 * @vma: Pointer to @vm_area_struct struct
+	 * @address: the virtual address to walk
+	 */
+	struct vm_area_struct *vma;
+	unsigned long address;
+	/**
+	 * Internals:
+	 *
+	 * The caller shouldn't touch any of these.
+	 */
+	spinlock_t *lock;
+	pte_t *ptep;
+	/**
+	 * Outputs:
+	 *
+	 * @pfn: the PFN of the address
+	 * @pgprot: the pgprot_t of the mapping
+	 * @writable: whether the mapping is writable
+	 * @special: whether the mapping is a special mapping (real PFN maps)
+	 */
+	unsigned long pfn;
+	pgprot_t pgprot;
+	bool writable;
+	bool special;
+};
+int follow_pfnmap_start(struct follow_pfnmap_args *args);
+void follow_pfnmap_end(struct follow_pfnmap_args *args);
+
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
-- 
cgit v1.2.3


From b0a1c0d0edcd75a0f8ec5fd19dbd64b8d097f534 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 26 Aug 2024 16:43:50 -0400
Subject: mm: remove follow_pte()

follow_pte() users have been converted to follow_pfnmap*().  Remove the
API.

Link: https://lkml.kernel.org/r/20240826204353.2228736-17-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 62bd89414897..d750be768121 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2368,8 +2368,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct vm_area_struct *vma, unsigned long address,
-	       pte_t **ptepp, spinlock_t **ptlp);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
-- 
cgit v1.2.3


From 82ce8e2f31a1eb05b1527c3d807bea40031df913 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Sat, 7 Sep 2024 17:40:42 +0200
Subject: set_memory: add __must_check to generic stubs

Following query shows that architectures that don't provide
asm/set_memory.h don't use set_memory_...() functions.

  $ git grep set_memory_ alpha arc csky hexagon loongarch m68k microblaze mips nios2 openrisc parisc sh sparc um xtensa

Following query shows that all core users of set_memory_...()
functions always take returned value into account:

  $ git grep -w -e set_memory_ro -e set_memory_rw -e set_memory_x -e set_memory_nx -e set_memory_rox `find . -maxdepth 1 -type d | grep -v arch | grep /`

set_memory_...() functions can fail, leaving the memory attributes
unchanged. Make sure all callers check the returned code.

Link: https://github.com/KSPP/linux/issues/7
Link: https://lkml.kernel.org/r/6a89ffc69666de84721216947c6b6c7dcca39d7d.1725723347.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/set_memory.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 95ac8398ee72..e7aec20fb44f 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -8,10 +8,10 @@
 #ifdef CONFIG_ARCH_HAS_SET_MEMORY
 #include <asm/set_memory.h>
 #else
-static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
-static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
-static inline int set_memory_x(unsigned long addr,  int numpages) { return 0; }
-static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int __must_check set_memory_x(unsigned long addr,  int numpages) { return 0; }
+static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return 0; }
 #endif
 
 #ifndef set_memory_rox
-- 
cgit v1.2.3


From 325efb16da2c840e165d9b620fec8049d4d664cc Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Mon, 9 Sep 2024 11:21:18 +1200
Subject: mm: add nr argument in mem_cgroup_swapin_uncharge_swap() helper to
 support large folios

With large folios swap-in, we might need to uncharge multiple entries all
together, add nr argument in mem_cgroup_swapin_uncharge_swap().

For the existing two users, just pass nr=1.

Link: https://lkml.kernel.org/r/20240908232119.2157-3-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Chuanhua Han <hanchuanhua@oppo.com>
Cc: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2ef94c74847d..34d2da05f2f1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -699,7 +699,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
 
 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
+
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
 
 void __mem_cgroup_uncharge(struct folio *folio);
 
@@ -1206,7 +1207,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 	return 0;
 }
 
-static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr)
 {
 }
 
-- 
cgit v1.2.3


From ed8d5b0ce1d738e13c60d6b1a901a56d832e5070 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 11 Sep 2024 15:13:20 +0200
Subject: Revert "uprobes: use vm_special_mapping close() functionality"

This reverts commit 08e28de1160a712724268fd33d77b32f1bc84d1c.

A malicious application can munmap() its "[uprobes]" vma and in this case
xol_mapping.close == uprobe_clear_state() will free the memory which can
be used by another thread, or the same thread when it hits the uprobe bp
afterwards.

Link: https://lkml.kernel.org/r/20240911131320.GA3448@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/uprobes.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 493dc95d912c..b503fafb7fb3 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -126,6 +126,7 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
 extern void uprobe_notify_resume(struct pt_regs *regs);
 extern bool uprobe_deny_signal(void);
 extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs);
+extern void uprobe_clear_state(struct mm_struct *mm);
 extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr);
 extern int  arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
-- 
cgit v1.2.3


From aef79e189ba2b32f78bd35daf2c0b41f3868a321 Mon Sep 17 00:00:00 2001
From: Carlos Song <carlos.song@nxp.com>
Date: Tue, 10 Sep 2024 13:16:25 +0800
Subject: i3c: master: support to adjust first broadcast address speed

According to I3C spec 6.2 Timing Specification, the Open Drain High Period
of SCL Clock timing for first broadcast address should be adjusted to 200ns
at least. I3C device working as i2c device will see the broadcast to close
its Spike Filter then change to work at I3C mode. After that I3C open drain
SCL high level should be adjusted back.

Signed-off-by: Carlos Song <carlos.song@nxp.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20240910051626.4052552-1-carlos.song@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 074f632868d9..2a1ed05d5782 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -277,6 +277,20 @@ enum i3c_bus_mode {
 	I3C_BUS_MODE_MIXED_SLOW,
 };
 
+/**
+ * enum i3c_open_drain_speed - I3C open-drain speed
+ * @I3C_OPEN_DRAIN_SLOW_SPEED: Slow open-drain speed for sending the first
+ *				broadcast address. The first broadcast address at this speed
+ *				will be visible to all devices on the I3C bus. I3C devices
+ *				working in I2C mode will turn off their spike filter when
+ *				switching into I3C mode.
+ * @I3C_OPEN_DRAIN_NORMAL_SPEED: Normal open-drain speed in I3C bus mode.
+ */
+enum i3c_open_drain_speed {
+	I3C_OPEN_DRAIN_SLOW_SPEED,
+	I3C_OPEN_DRAIN_NORMAL_SPEED,
+};
+
 /**
  * enum i3c_addr_slot_status - I3C address slot status
  * @I3C_ADDR_SLOT_FREE: address is free
@@ -436,6 +450,7 @@ struct i3c_bus {
  *		      NULL.
  * @enable_hotjoin: enable hot join event detect.
  * @disable_hotjoin: disable hot join event detect.
+ * @set_speed: adjust I3C open drain mode timing.
  */
 struct i3c_master_controller_ops {
 	int (*bus_init)(struct i3c_master_controller *master);
@@ -464,6 +479,7 @@ struct i3c_master_controller_ops {
 				 struct i3c_ibi_slot *slot);
 	int (*enable_hotjoin)(struct i3c_master_controller *master);
 	int (*disable_hotjoin)(struct i3c_master_controller *master);
+	int (*set_speed)(struct i3c_master_controller *master, enum i3c_open_drain_speed speed);
 };
 
 /**
-- 
cgit v1.2.3


From 9ba5dcc722de4390a1d3211b2ee3c864f84f5461 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Fri, 20 Sep 2024 01:48:17 +0100
Subject: block: Remove unused blk_limits_io_{min,opt}

blk_limits_io_min and blk_limits_io_opt are unused since the
recent commit
  0a94a469a4f0 ("dm: stop using blk_limits_io_{min,opt}")

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20240920004817.676216-1-linux@treblig.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 643c9020a35a..50c3b959da28 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -968,8 +968,6 @@ static inline void blk_queue_disable_write_zeroes(struct request_queue *q)
 /*
  * Access functions for manipulating queue properties
  */
-extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
-extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
-- 
cgit v1.2.3


From 65f666c6203600053478ce8e34a1db269a8701c9 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Thu, 19 Sep 2024 10:17:09 +0800
Subject: lib/sbitmap: define swap_lock as raw_spinlock_t

When called from sbitmap_queue_get(), sbitmap_deferred_clear() may be run
with preempt disabled. In RT kernel, spin_lock() can sleep, then warning
of "BUG: sleeping function called from invalid context" can be triggered.

Fix it by replacing it with raw_spin_lock.

Cc: Yang Yang <yang.yang@vivo.com>
Fixes: 72d04bdcf3f7 ("sbitmap: fix io hung due to race on sbitmap_word::cleared")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Yang Yang <yang.yang@vivo.com>
Link: https://lore.kernel.org/r/20240919021709.511329-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index c09cdcc99471..189140bf11fc 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -40,7 +40,7 @@ struct sbitmap_word {
 	/**
 	 * @swap_lock: serializes simultaneous updates of ->word and ->cleared
 	 */
-	spinlock_t swap_lock;
+	raw_spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
-- 
cgit v1.2.3


From 652bfcb76fe689f4d85b6f3688025a87fb94f9a1 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 31 Jul 2024 15:43:13 +0800
Subject: KEYS: Remove unused declarations

These declarations are never implemented, remove it.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/key.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 943a432da3ae..074dca3222b9 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -436,9 +436,6 @@ extern key_ref_t keyring_search(key_ref_t keyring,
 				const char *description,
 				bool recurse);
 
-extern int keyring_add_key(struct key *keyring,
-			   struct key *key);
-
 extern int keyring_restrict(key_ref_t keyring, const char *type,
 			    const char *restriction);
 
-- 
cgit v1.2.3


From 60749cbe3d8ae572a6c7dda675de3e8b25797a18 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 15 Jul 2024 17:14:18 +1000
Subject: sunrpc: change sp_nrthreads from atomic_t to unsigned int.

sp_nrthreads is only ever accessed under the service mutex
  nlmsvc_mutex nfs_callback_mutex nfsd_mutex
so these is no need for it to be an atomic_t.

The fact that all code using it is single-threaded means that we can
simplify svc_pool_victim and remove the temporary elevation of
sp_nrthreads.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index e4fa25fafa97..99e9345d829e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -33,9 +33,9 @@
  * node traffic on multi-node NUMA NFS servers.
  */
 struct svc_pool {
-	unsigned int		sp_id;	    	/* pool id; also node id on NUMA */
+	unsigned int		sp_id;		/* pool id; also node id on NUMA */
 	struct lwq		sp_xprts;	/* pending transports */
-	atomic_t		sp_nrthreads;	/* # of threads in pool */
+	unsigned int		sp_nrthreads;	/* # of threads in pool */
 	struct list_head	sp_all_threads;	/* all server threads */
 	struct llist_head	sp_idle_threads; /* idle server threads */
 
-- 
cgit v1.2.3


From 3391fc92db8e761f1a2df5612fcb999dac6bc00a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 16 Sep 2024 09:45:40 +1000
Subject: sunrpc: allow svc threads to fail initialisation cleanly

If an svc thread needs to perform some initialisation that might fail,
it has no good way to handle the failure.

Before the thread can exit it must call svc_exit_thread(), but that
requires the service mutex to be held.  The thread cannot simply take
the mutex as that could deadlock if there is a concurrent attempt to
shut down all threads (which is unlikely, but not impossible).

nfsd currently call svc_exit_thread() unprotected in the unlikely event
that unshare_fs_struct() fails.

We can clean this up by introducing svc_thread_init_status() by which an
svc thread can report whether initialisation has succeeded.  If it has,
it continues normally into the action loop.  If it has not,
svc_thread_init_status() immediately aborts the thread.
svc_start_kthread() waits for either of these to happen, and calls
svc_exit_thread() (under the mutex) if the thread aborted.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 99e9345d829e..c419a61f60e5 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -21,6 +21,7 @@
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/pagevec.h>
+#include <linux/kthread.h>
 
 /*
  *
@@ -232,6 +233,11 @@ struct svc_rqst {
 	struct net		*rq_bc_net;	/* pointer to backchannel's
 						 * net namespace
 						 */
+
+	int			rq_err;		/* Thread sets this to inidicate
+						 * initialisation success.
+						 */
+
 	unsigned long	bc_to_initval;
 	unsigned int	bc_to_retries;
 	void **			rq_lease_breaker; /* The v4 client breaking a lease */
@@ -305,6 +311,31 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp)
 	return test_bit(RQ_VICTIM, &rqstp->rq_flags);
 }
 
+/**
+ * svc_thread_init_status - report whether thread has initialised successfully
+ * @rqstp: the thread in question
+ * @err: errno code
+ *
+ * After performing any initialisation that could fail, and before starting
+ * normal work, each sunrpc svc_thread must call svc_thread_init_status()
+ * with an appropriate error, or zero.
+ *
+ * If zero is passed, the thread is ready and must continue until
+ * svc_thread_should_stop() returns true.  If a non-zero error is passed
+ * the call will not return - the thread will exit.
+ */
+static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err)
+{
+	rqstp->rq_err = err;
+	/* memory barrier ensures assignment to error above is visible before
+	 * waitqueue_active() test below completes.
+	 */
+	smp_mb();
+	wake_up_var(&rqstp->rq_err);
+	if (err)
+		kthread_exit(1);
+}
+
 struct svc_deferred_req {
 	u32			prot;	/* protocol (UDP or TCP) */
 	struct svc_xprt		*xprt;
-- 
cgit v1.2.3


From 36ffa3d0de54c1cf516ea32a5ec556f5c9874795 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 29 Jul 2024 11:47:23 +1000
Subject: nfsd: be more systematic about selecting error codes for internal
 use.

Rather than using ad hoc values for internal errors (30000, 11000, ...)
use 'enum' to sequentially allocate numbers starting from the first
known available number - now visible as NFS4ERR_FIRST_FREE.

The goal is values that are distinct from all be32 error codes.  To get
those we must first select integers that are not already used, then
convert them with cpu_to_be32().

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/nfs4.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index f9df88091c6d..8d7430d9f218 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -281,15 +281,18 @@ enum nfsstat4 {
 	/* nfs42 */
 	NFS4ERR_PARTNER_NOTSUPP	= 10088,
 	NFS4ERR_PARTNER_NO_AUTH	= 10089,
-	NFS4ERR_UNION_NOTSUPP = 10090,
-	NFS4ERR_OFFLOAD_DENIED = 10091,
-	NFS4ERR_WRONG_LFS = 10092,
-	NFS4ERR_BADLABEL = 10093,
-	NFS4ERR_OFFLOAD_NO_REQS = 10094,
+	NFS4ERR_UNION_NOTSUPP	= 10090,
+	NFS4ERR_OFFLOAD_DENIED	= 10091,
+	NFS4ERR_WRONG_LFS	= 10092,
+	NFS4ERR_BADLABEL	= 10093,
+	NFS4ERR_OFFLOAD_NO_REQS	= 10094,
 
 	/* xattr (RFC8276) */
-	NFS4ERR_NOXATTR        = 10095,
-	NFS4ERR_XATTR2BIG      = 10096,
+	NFS4ERR_NOXATTR		= 10095,
+	NFS4ERR_XATTR2BIG	= 10096,
+
+	/* can be used for internal errors */
+	NFS4ERR_FIRST_FREE
 };
 
 /* error codes for internal client use */
-- 
cgit v1.2.3


From c4de97f7c45434985e5dbf2d6ccc9eca676e37fe Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 29 Jul 2024 16:52:32 -0400
Subject: svcrdma: Handle device removal outside of the CM event handler

Synchronously wait for all disconnects to complete to ensure the
transports have divested all hardware resources before the
underlying RDMA device can safely be removed.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d33bab33099a..619fc0bd837a 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -48,6 +48,7 @@
 #include <linux/sunrpc/rpc_rdma.h>
 #include <linux/sunrpc/rpc_rdma_cid.h>
 #include <linux/sunrpc/svc_rdma_pcl.h>
+#include <linux/sunrpc/rdma_rn.h>
 
 #include <linux/percpu_counter.h>
 #include <rdma/ib_verbs.h>
@@ -76,6 +77,7 @@ struct svcxprt_rdma {
 	struct svc_xprt      sc_xprt;		/* SVC transport structure */
 	struct rdma_cm_id    *sc_cm_id;		/* RDMA connection id */
 	struct list_head     sc_accept_q;	/* Conn. waiting accept */
+	struct rpcrdma_notification sc_rn;	/* removal notification */
 	int		     sc_ord;		/* RDMA read limit */
 	int                  sc_max_send_sges;
 	bool		     sc_snd_w_inv;	/* OK to use Send With Invalidate */
-- 
cgit v1.2.3


From 4b132aacb0768ac1e652cf517097ea6f237214b9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 13 Sep 2024 14:08:13 -0400
Subject: tools: Add xdrgen

Add a Python-based tool for translating XDR specifications into XDR
encoder and decoder functions written in the Linux kernel's C coding
style. The generator attempts to match the usual C coding style of
the Linux kernel's SunRPC consumers.

This approach is similar to the netlink code generator in
tools/net/ynl .

The maintainability benefits of machine-generated XDR code include:

- Stronger type checking
- Reduces the number of bugs introduced by human error
- Makes the XDR code easier to audit and analyze
- Enables rapid prototyping of new RPC-based protocols
- Hardens the layering between protocol logic and marshaling
- Makes it easier to add observability on demand
- Unit tests might be built for both the tool and (automatically)
  for the generated code

In addition, converting the XDR layer to use memory-safe languages
such as Rust will be easier if much of the code can be converted
automatically.

Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xdrgen/_builtins.h | 243 ++++++++++++++++++++++++++++++++
 include/linux/sunrpc/xdrgen/_defs.h     |  26 ++++
 2 files changed, 269 insertions(+)
 create mode 100644 include/linux/sunrpc/xdrgen/_builtins.h
 create mode 100644 include/linux/sunrpc/xdrgen/_defs.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h
new file mode 100644
index 000000000000..68746c59fc9a
--- /dev/null
+++ b/include/linux/sunrpc/xdrgen/_builtins.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This header defines XDR data type primitives specified in
+ * Section 4 of RFC 4506, used by RPC programs implemented
+ * in the Linux kernel.
+ */
+
+#ifndef _SUNRPC_XDRGEN__BUILTINS_H_
+#define _SUNRPC_XDRGEN__BUILTINS_H_
+
+#include <linux/sunrpc/xdr.h>
+
+static inline bool
+xdrgen_decode_void(struct xdr_stream *xdr)
+{
+	return true;
+}
+
+static inline bool
+xdrgen_encode_void(struct xdr_stream *xdr)
+{
+	return true;
+}
+
+static inline bool
+xdrgen_decode_bool(struct xdr_stream *xdr, bool *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = (*p != xdr_zero);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_bool(struct xdr_stream *xdr, bool val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = val ? xdr_one : xdr_zero;
+	return true;
+}
+
+static inline bool
+xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = be32_to_cpup(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_int(struct xdr_stream *xdr, s32 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_int(struct xdr_stream *xdr, u32 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = be32_to_cpup(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_int(struct xdr_stream *xdr, u32 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_long(struct xdr_stream *xdr, s32 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = be32_to_cpup(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_long(struct xdr_stream *xdr, s32 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_long(struct xdr_stream *xdr, u32 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = be32_to_cpup(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_long(struct xdr_stream *xdr, u32 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_hyper(struct xdr_stream *xdr, s64 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = get_unaligned_be64(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_hyper(struct xdr_stream *xdr, s64 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+	if (unlikely(!p))
+		return false;
+	put_unaligned_be64(val, p);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_unsigned_hyper(struct xdr_stream *xdr, u64 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = get_unaligned_be64(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_hyper(struct xdr_stream *xdr, u64 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+	if (unlikely(!p))
+		return false;
+	put_unaligned_be64(val, p);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen)
+{
+	__be32 *p;
+	u32 len;
+
+	if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT))
+		return false;
+	if (unlikely(maxlen && len > maxlen))
+		return false;
+	if (len != 0) {
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			return false;
+		ptr->data = (unsigned char *)p;
+	}
+	ptr->len = len;
+	return true;
+}
+
+static inline bool
+xdrgen_encode_string(struct xdr_stream *xdr, string val, u32 maxlen)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len));
+
+	if (unlikely(!p))
+		return false;
+	xdr_encode_opaque(p, val.data, val.len);
+	return true;
+}
+
+static inline bool
+xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen)
+{
+	__be32 *p;
+	u32 len;
+
+	if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT))
+		return false;
+	if (unlikely(maxlen && len > maxlen))
+		return false;
+	if (len != 0) {
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			return false;
+		ptr->data = (u8 *)p;
+	}
+	ptr->len = len;
+	return true;
+}
+
+static inline bool
+xdrgen_encode_opaque(struct xdr_stream *xdr, opaque val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len));
+
+	if (unlikely(!p))
+		return false;
+	xdr_encode_opaque(p, val.data, val.len);
+	return true;
+}
+
+#endif /* _SUNRPC_XDRGEN__BUILTINS_H_ */
diff --git a/include/linux/sunrpc/xdrgen/_defs.h b/include/linux/sunrpc/xdrgen/_defs.h
new file mode 100644
index 000000000000..be9e62371758
--- /dev/null
+++ b/include/linux/sunrpc/xdrgen/_defs.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This header defines XDR data type primitives specified in
+ * Section 4 of RFC 4506, used by RPC programs implemented
+ * in the Linux kernel.
+ */
+
+#ifndef _SUNRPC_XDRGEN__DEFS_H_
+#define _SUNRPC_XDRGEN__DEFS_H_
+
+#define TRUE	(true)
+#define FALSE	(false)
+
+typedef struct {
+	u32 len;
+	unsigned char *data;
+} string;
+
+typedef struct {
+	u32 len;
+	u8 *data;
+} opaque;
+
+#endif /* _SUNRPC_XDRGEN__DEFS_H_ */
-- 
cgit v1.2.3


From 663ad8b1df8724cd5e01df66ea67ce0424fbcdf6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 10 Sep 2024 15:31:19 -0400
Subject: xdrgen: Fix return code checking in built-in XDR decoders

xdr_stream_encode_u32() returns XDR_UNIT on success.
xdr_stream_decode_u32() returns zero or -EMSGSIZE, but never
XDR_UNIT.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xdrgen/_builtins.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h
index 68746c59fc9a..66ca3ece951a 100644
--- a/include/linux/sunrpc/xdrgen/_builtins.h
+++ b/include/linux/sunrpc/xdrgen/_builtins.h
@@ -184,7 +184,7 @@ xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen)
 	__be32 *p;
 	u32 len;
 
-	if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT))
+	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return false;
 	if (unlikely(maxlen && len > maxlen))
 		return false;
@@ -215,7 +215,7 @@ xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen)
 	__be32 *p;
 	u32 len;
 
-	if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT))
+	if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0))
 		return false;
 	if (unlikely(maxlen && len > maxlen))
 		return false;
-- 
cgit v1.2.3


From bb0e391975f8da826305cbaa3e3d34b03c47e2a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 22 Sep 2024 09:10:17 +0200
Subject: dma-mapping: fix vmap and mmap of noncontiougs allocations

Commit b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") switched
to use direct calls to dma-iommu, but missed the dma_vmap_noncontiguous,
dma_vunmap_noncontiguous and dma_mmap_noncontiguous behavior keyed off the
presence of the alloc_noncontiguous method.

Fix this by removing the now unused alloc_noncontiguous and
free_noncontiguous methods and moving the vmapping and mmaping of the
noncontiguous allocations into the iommu code, as it is the only provider
of actually noncontiguous allocations.

Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu")
Reported-by: Xi Ruoyao <xry111@xry111.site>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Leon Romanovsky <leon@kernel.org>
Tested-by: Xi Ruoyao <xry111@xry111.site>
---
 include/linux/dma-map-ops.h | 19 -------------------
 include/linux/iommu-dma.h   |  6 ++++++
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 9668ddf3696e..b7773201414c 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -24,11 +24,6 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
-	struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size,
-			enum dma_data_direction dir, gfp_t gfp,
-			unsigned long attrs);
-	void (*free_noncontiguous)(struct device *dev, size_t size,
-			struct sg_table *sgt, enum dma_data_direction dir);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			void *, dma_addr_t, size_t, unsigned long attrs);
 
@@ -206,20 +201,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_DMA_GLOBAL_POOL */
 
-/*
- * This is the actual return value from the ->alloc_noncontiguous method.
- * The users of the DMA API should only care about the sg_table, but to make
- * the DMA-API internal vmaping and freeing easier we stash away the page
- * array as well (except for the fallback case).  This can go away any time,
- * e.g. when a vmap-variant that takes a scatterlist comes along.
- */
-struct dma_sgt_handle {
-	struct sg_table sgt;
-	struct page **pages;
-};
-#define sgt_handle(sgt) \
-	container_of((sgt), struct dma_sgt_handle, sgt)
-
 int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs);
diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h
index 1bb55ca1ab79..7bf145a52d6a 100644
--- a/include/linux/iommu-dma.h
+++ b/include/linux/iommu-dma.h
@@ -44,6 +44,12 @@ struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size,
 		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
 void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
 		struct sg_table *sgt, enum dma_data_direction dir);
+void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt);
+#define iommu_dma_vunmap_noncontiguous(dev, vaddr) \
+	vunmap(vaddr);
+int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+		size_t size, struct sg_table *sgt);
 void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
 		size_t size, enum dma_data_direction dir);
 void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
-- 
cgit v1.2.3


From 3d09ff45469eb2912ce2227c47efac297230d6d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 22 Sep 2024 09:21:10 +0200
Subject: iommu/dma: remove most stubs in iommu-dma.h

The direct calls from mapping.c all guarded by use_dma_iommu(), so don't
bother to provide stubs, but instead just expose the prototypes
unconditionally.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/iommu-dma.h | 108 ++++------------------------------------------
 1 file changed, 8 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h
index 7bf145a52d6a..508beaa44c39 100644
--- a/include/linux/iommu-dma.h
+++ b/include/linux/iommu-dma.h
@@ -14,6 +14,13 @@ static inline bool use_dma_iommu(struct device *dev)
 {
 	return dev->dma_iommu;
 }
+#else
+static inline bool use_dma_iommu(struct device *dev)
+{
+	return false;
+}
+#endif /* CONFIG_IOMMU_DMA */
+
 dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs);
@@ -58,104 +65,5 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
 		int nelems, enum dma_data_direction dir);
 void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 		int nelems, enum dma_data_direction dir);
-#else
-static inline bool use_dma_iommu(struct device *dev)
-{
-	return false;
-}
-static inline dma_addr_t iommu_dma_map_page(struct device *dev,
-		struct page *page, unsigned long offset, size_t size,
-		enum dma_data_direction dir, unsigned long attrs)
-{
-	return DMA_MAPPING_ERROR;
-}
-static inline void iommu_dma_unmap_page(struct device *dev,
-		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
-static inline int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	return -EINVAL;
-}
-static inline void iommu_dma_unmap_sg(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
-static inline void *iommu_dma_alloc(struct device *dev, size_t size,
-		dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
-{
-	return NULL;
-}
-static inline int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
-		void *cpu_addr, dma_addr_t dma_addr, size_t size,
-		unsigned long attrs)
-{
-	return -EINVAL;
-}
-static inline int iommu_dma_get_sgtable(struct device *dev,
-		struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr,
-		size_t size, unsigned long attrs)
-{
-	return -EINVAL;
-}
-static inline unsigned long iommu_dma_get_merge_boundary(struct device *dev)
-{
-	return 0;
-}
-static inline size_t iommu_dma_opt_mapping_size(void)
-{
-	return 0;
-}
-static inline size_t iommu_dma_max_mapping_size(struct device *dev)
-{
-	return 0;
-}
-static inline void iommu_dma_free(struct device *dev, size_t size,
-		void *cpu_addr, dma_addr_t handle, unsigned long attrs)
-{
-}
-static inline dma_addr_t iommu_dma_map_resource(struct device *dev,
-		phys_addr_t phys, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-	return DMA_MAPPING_ERROR;
-}
-static inline void iommu_dma_unmap_resource(struct device *dev,
-		dma_addr_t handle, size_t size, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
-static inline struct sg_table *
-iommu_dma_alloc_noncontiguous(struct device *dev, size_t size,
-		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
-{
-	return NULL;
-}
-static inline void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
-		struct sg_table *sgt, enum dma_data_direction dir)
-{
-}
-static inline void iommu_dma_sync_single_for_cpu(struct device *dev,
-		dma_addr_t dma_handle, size_t size,
-		enum dma_data_direction dir)
-{
-}
-static inline void iommu_dma_sync_single_for_device(struct device *dev,
-		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
-{
-}
-static inline void iommu_dma_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nelems,
-		enum dma_data_direction dir)
-{
-}
-static inline void iommu_dma_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nelems,
-		enum dma_data_direction dir)
-{
-}
-#endif /* CONFIG_IOMMU_DMA */
+
 #endif /* _LINUX_IOMMU_DMA_H */
-- 
cgit v1.2.3


From d0dd066a0fa26d55c19ace9e89dedd9504c5bcba Mon Sep 17 00:00:00 2001
From: "Christoph Lameter (Ampere)" <cl@gentwo.org>
Date: Wed, 12 Jun 2024 09:49:56 -0700
Subject: seqcount: replace smp_rmb() in read_seqcount() with load acquire

Many architectures support load acquire which can replace a memory
barrier and save some cycles.

A typical sequence

	do {
		seq = read_seqcount_begin(&s);
		<something>
	} while (read_seqcount_retry(&s, seq);

requires 13 cycles on an N1 Neoverse arm64 core (Ampere Altra, to be
specific) for an empty loop.  Two read memory barriers are needed.  One
for each of the seqcount_* functions.

We can replace the first read barrier with a load acquire of the
seqcount which saves us one barrier.

On the Altra doing so reduces the cycle count from 13 to 8.

According to ARM, this is a general improvement for the ARM64
architecture and not specific to a certain processor.

See

  https://developer.arm.com/documentation/102336/0100/Load-Acquire-and-Store-Release-instructions

 "Weaker ordering requirements that are imposed by Load-Acquire and
  Store-Release instructions allow for micro-architectural
  optimizations, which could reduce some of the performance impacts that
  are otherwise imposed by an explicit memory barrier.

  If the ordering requirement is satisfied using either a Load-Acquire
  or Store-Release, then it would be preferable to use these
  instructions instead of a DMB"

[ NOTE! This is my original minimal patch that unconditionally switches
  over to using smp_load_acquire(), instead of the much more involved
  and subtle patch that Christoph Lameter wrote that made it
  conditional.

  But Christoph gets authorship credit because I had initially thought
  that we needed the more complex model, and Christoph ran with it it
  and did the work. Only after looking at code generation for all the
  relevant architectures, did I come to the conclusion that nobody
  actually really needs the old "smp_rmb()" model.

  Even architectures without load-acquire support generally do as well
  or better with smp_load_acquire().

  So credit to Christoph, but if this then causes issues on other
  architectures, put the blame solidly on me.

  Also note as part of the ruthless simplification, this gets rid of the
  overly subtle optimization where some code uses a non-barrier version
  of the sequence count (see the __read_seqcount_begin() users in
  fs/namei.c). They then play games with their own barriers and/or with
  nested sequence counts.

  Those optimizations are literally meaningless on x86, and questionable
  elsewhere. If somebody can show that they matter, we need to re-do
  them more cleanly than "use an internal helper".       - Linus ]

Signed-off-by: Christoph Lameter (Ampere) <cl@gentwo.org>
Link: https://lore.kernel.org/all/20240912-seq_optimize-v3-1-8ee25e04dffa@gentwo.org/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seqlock.h | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..fffeb754880f 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -157,7 +157,7 @@ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)	\
 static __always_inline unsigned						\
 __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 {									\
-	unsigned seq = READ_ONCE(s->seqcount.sequence);			\
+	unsigned seq = smp_load_acquire(&s->seqcount.sequence);		\
 									\
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT))				\
 		return seq;						\
@@ -170,7 +170,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 		 * Re-read the sequence counter since the (possibly	\
 		 * preempted) writer made progress.			\
 		 */							\
-		seq = READ_ONCE(s->seqcount.sequence);			\
+		seq = smp_load_acquire(&s->seqcount.sequence);		\
 	}								\
 									\
 	return seq;							\
@@ -208,7 +208,7 @@ static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
 
 static inline unsigned __seqprop_sequence(const seqcount_t *s)
 {
-	return READ_ONCE(s->sequence);
+	return smp_load_acquire(&s->sequence);
 }
 
 static inline bool __seqprop_preemptible(const seqcount_t *s)
@@ -263,17 +263,9 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 #define seqprop_assert(s)		__seqprop(s, assert)(s)
 
 /**
- * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * __read_seqcount_begin() - begin a seqcount_t read section
  * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
  *
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
- *
  * Return: count to be passed to read_seqcount_retry()
  */
 #define __read_seqcount_begin(s)					\
@@ -293,13 +285,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  *
  * Return: count to be passed to read_seqcount_retry()
  */
-#define raw_read_seqcount_begin(s)					\
-({									\
-	unsigned _seq = __read_seqcount_begin(s);			\
-									\
-	smp_rmb();							\
-	_seq;								\
-})
+#define raw_read_seqcount_begin(s) __read_seqcount_begin(s)
 
 /**
  * read_seqcount_begin() - begin a seqcount_t read critical section
@@ -328,7 +314,6 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 ({									\
 	unsigned __seq = seqprop_sequence(s);				\
 									\
-	smp_rmb();							\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
 	__seq;								\
 })
-- 
cgit v1.2.3


From ffcdc4c628e1a30489da10dd78358e89c823b341 Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Fri, 6 Sep 2024 16:34:52 +0200
Subject: fs/mnt_idmapping: introduce an invalid_mnt_idmap

Link: https://lore.kernel.org/linux-fsdevel/20240904-baugrube-erhoben-b3c1c49a2645@brauner/
Suggested-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/linux/mnt_idmapping.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index cd4d5c8781f5..b1b219bc3422 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -9,6 +9,7 @@ struct mnt_idmap;
 struct user_namespace;
 
 extern struct mnt_idmap nop_mnt_idmap;
+extern struct mnt_idmap invalid_mnt_idmap;
 extern struct user_namespace init_user_ns;
 
 typedef struct {
-- 
cgit v1.2.3


From f8eb5bd9a818cc5f2a1e50b22b0091830b28cc36 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 23 Sep 2024 08:58:31 -0700
Subject: mm: fix build on 32-bit targets without MAX_PHYSMEM_BITS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The merge resolution to deal with the conflict between commits
ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory
address space") and 99185c10d5d9 ("resource, kunit: add test case for
region_intersects()") ended up being broken in configurations didn't
define a MAX_PHYSMEM_BITS and that had a 32-bit 'phys_addr_t'.

The fallback to using all bits set (ie "(-1ULL)") ended up causing a
build error:

    kernel/resource.c: In function ‘gfr_start’:
    include/linux/minmax.h:93:30: error: conversion from ‘long long unsigned int’ to ‘resource_size_t’ {aka ‘unsigned int’} changes value from ‘18446744073709551615’ to ‘4294967295’ [-Werror=overflow]

this was reported by Geert for m68k, but he points out that it happens
on other 32-bit architectures too, eg mips, xtensa, parisc, and powerpc.

Limiting 'PHYSMEM_END' to a 'phys_addr_t' (which is the same as
'resource_size_t') fixes the build, but Geert points out that it will
then cause a silent overflow in mm/sparse.c:

	unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;

so we actually do want PHYSMEM_END to be defined a 64-bit type - just
not all ones, and not larger than 'phys_addr_t'.

The proper fix is probably to not have some kind of default fallback at
all, but just make sure every architecture has a valid MAX_PHYSMEM_BITS.
But in the meantime, this just applies the rule that PHYSMEM_END is the
largest value that fits in a 'phys_addr_t', but does not have the high
bit set in 64 bits.

Ugly, ugly.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b62437447077..ecf63d2b0582 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -101,7 +101,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
 # ifdef MAX_PHYSMEM_BITS
 # define PHYSMEM_END	((1ULL << MAX_PHYSMEM_BITS) - 1)
 # else
-# define PHYSMEM_END	(-1ULL)
+# define PHYSMEM_END	(((phys_addr_t)-1)&~(1ULL<<63))
 # endif
 #endif
 
-- 
cgit v1.2.3


From d98f72272500f505cd7e152ffa456e64ee3855f0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 6 Sep 2024 12:32:03 +1000
Subject: nfs: simplify and guarantee owner uniqueness.

I have evidence of an Linux NFS client getting NFS4ERR_BAD_SEQID to a
v4.0 LOCK request to a Linux server (which had fixed the problem with
RELEASE_LOCKOWNER bug fixed).
The LOCK request presented a "new" lock owner so there are two seq ids
in the request: that for the open file, and that for the new lock.
Given the context I am confident that the new lock owner was reported to
have the wrong seqid.  As lock owner identifiers are reused, the server
must still have a lock owner active which the client thinks is no longer
active.

I wasn't able to determine a root-cause but the simplest fix seems to be
to ensure lock owners are always unique much as open owners are (thanks
to a time stamp).  The easiest way to ensure uniqueness is with a 64bit
counter for each server.  That will never cycle (if updated once a
nanosecond the last 584 years.  A single NFS server would not handle
open/lock requests nearly that fast, and a Linux node is unlikely to
have an uptime approaching that).

This patch removes the 2 ida and instead uses a per-server
atomic64_t to provide uniqueness.

Note that the lock owner already encodes the id as 64 bits even though
it is a 32bit value.  So changing to a 64bit value does not change the
encoding of the lock owner.  The open owner encoding is now 4 bytes
larger.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs_fs_sb.h | 3 +--
 include/linux/nfs_xdr.h   | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 1df86ab98c77..e1e47ebd83ef 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -234,8 +234,7 @@ struct nfs_server {
 	/* the following fields are protected by nfs_client->cl_lock */
 	struct rb_root		state_owners;
 #endif
-	struct ida		openowner_id;
-	struct ida		lockowner_id;
+	atomic64_t		owner_ctr;
 	struct list_head	state_owners_lru;
 	struct list_head	layouts;
 	struct list_head	delegations;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 45623af3e7b8..96ba04ab24f3 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -446,7 +446,7 @@ struct nfs42_clone_res {
 
 struct stateowner_id {
 	__u64	create_time;
-	__u32	uniquifier;
+	__u64	uniquifier;
 };
 
 struct nfs4_open_delegation {
-- 
cgit v1.2.3


From 0b108e83795c9c23101f584ef7e3ab4f1f120ef0 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Mon, 19 Aug 2024 08:58:59 -0700
Subject: SUNRPC: convert RPC_TASK_* constants to enum

The RPC_TASK_* constants are defined as macros, which means that most
kernel builds will not contain their definitions in the debuginfo.
However, it's quite useful for debuggers to be able to view the task
state constant and interpret it correctly. Conversion to an enum will
ensure the constants are present in debuginfo and can be interpreted by
debuggers without needing to hard-code them and track their changes.

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/sunrpc/sched.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 0c77ba488bba..fec1e8a1570c 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -151,13 +151,15 @@ struct rpc_task_setup {
 #define RPC_WAS_SENT(t)		((t)->tk_flags & RPC_TASK_SENT)
 #define RPC_IS_MOVEABLE(t)	((t)->tk_flags & RPC_TASK_MOVEABLE)
 
-#define RPC_TASK_RUNNING	0
-#define RPC_TASK_QUEUED		1
-#define RPC_TASK_ACTIVE		2
-#define RPC_TASK_NEED_XMIT	3
-#define RPC_TASK_NEED_RECV	4
-#define RPC_TASK_MSG_PIN_WAIT	5
-#define RPC_TASK_SIGNALLED	6
+enum {
+	RPC_TASK_RUNNING,
+	RPC_TASK_QUEUED,
+	RPC_TASK_ACTIVE,
+	RPC_TASK_NEED_XMIT,
+	RPC_TASK_NEED_RECV,
+	RPC_TASK_MSG_PIN_WAIT,
+	RPC_TASK_SIGNALLED,
+};
 
 #define rpc_test_and_set_running(t) \
 				test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
-- 
cgit v1.2.3


From dfb07e990a0d019d7ae9b78dd4260620ce32e79a Mon Sep 17 00:00:00 2001
From: Dan Aloni <dan.aloni@vastdata.com>
Date: Wed, 24 Jul 2024 14:07:12 +0300
Subject: nfs: add 'noalignwrite' option for lock-less 'lost writes' prevention

There are some applications that write to predefined non-overlapping
file offsets from multiple clients and therefore don't need to rely on
file locking. However, if these applications want non-aligned offsets
and sizes they need to either use locks or risk data corruption, as the
NFS client defaults to extending writes to whole pages.

This commit adds a new mount option `noalignwrite`, which allows to turn
that off and avoid the need of locking, as long as these applications
don't overlap on offsets.

Signed-off-by: Dan Aloni <dan.aloni@vastdata.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs_fs_sb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e1e47ebd83ef..c49bfdded5c1 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -158,6 +158,7 @@ struct nfs_server {
 #define NFS_MOUNT_WRITE_WAIT		0x02000000
 #define NFS_MOUNT_TRUNK_DISCOVERY	0x04000000
 #define NFS_MOUNT_SHUTDOWN			0x08000000
+#define NFS_MOUNT_NO_ALIGNWRITE		0x10000000
 
 	unsigned int		fattr_valid;	/* Valid attributes */
 	unsigned int		caps;		/* server capabilities */
-- 
cgit v1.2.3


From 4806ded4c14c5e8fdc6ce885d83221a78c06a428 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:35 -0400
Subject: nfs_common: factor out nfs_errtbl and nfs_stat_to_errno

Common nfs_stat_to_errno() is used by both fs/nfs/nfs2xdr.c and
fs/nfs/nfs3xdr.c

Will also be used by fs/nfsd/localio.c

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs_common.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 include/linux/nfs_common.h

(limited to 'include/linux')

diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h
new file mode 100644
index 000000000000..3395c4a4d372
--- /dev/null
+++ b/include/linux/nfs_common.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file contains constants and methods used by both NFS client and server.
+ */
+#ifndef _LINUX_NFS_COMMON_H
+#define _LINUX_NFS_COMMON_H
+
+#include <linux/errno.h>
+#include <uapi/linux/nfs.h>
+
+/* Mapping from NFS error code to "errno" error code. */
+#define errno_NFSERR_IO EIO
+
+int nfs_stat_to_errno(enum nfs_stat status);
+
+#endif /* _LINUX_NFS_COMMON_H */
-- 
cgit v1.2.3


From 1fcb16674e37e434efe68ec3e142229f35b6b9e1 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:36 -0400
Subject: nfs_common: factor out nfs4_errtbl and nfs4_stat_to_errno

Common nfs4_stat_to_errno() is used by fs/nfs/nfs4xdr.c and will be
used by fs/nfs/localio.c

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs_common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h
index 3395c4a4d372..5fc02df88252 100644
--- a/include/linux/nfs_common.h
+++ b/include/linux/nfs_common.h
@@ -12,5 +12,6 @@
 #define errno_NFSERR_IO EIO
 
 int nfs_stat_to_errno(enum nfs_stat status);
+int nfs4_stat_to_errno(int stat);
 
 #endif /* _LINUX_NFS_COMMON_H */
-- 
cgit v1.2.3


From 1545e488b1f908b10f6dff0c278c6b7a37122de8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:37 -0400
Subject: nfs: factor out {encode,decode}_opaque_fixed to nfs_xdr.h

Eliminates duplicate functions in various files to allow for
additional callers.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs_xdr.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 96ba04ab24f3..12d8e47bc5a3 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1853,6 +1853,24 @@ struct nfs_rpc_ops {
 	void	(*disable_swap)(struct inode *inode);
 };
 
+/*
+ * Helper functions used by NFS client and/or server
+ */
+static inline void encode_opaque_fixed(struct xdr_stream *xdr,
+				       const void *buf, size_t len)
+{
+	WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
+}
+
+static inline int decode_opaque_fixed(struct xdr_stream *xdr,
+				      void *buf, size_t len)
+{
+	ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
+	if (unlikely(ret < 0))
+		return -EIO;
+	return 0;
+}
+
 /*
  * Function vectors etc. for the NFS client
  */
@@ -1866,4 +1884,4 @@ extern const struct rpc_version nfs_version4;
 extern const struct rpc_version nfsacl_version3;
 extern const struct rpc_program nfsacl_program;
 
-#endif
+#endif /* _LINUX_NFS_XDR_H */
-- 
cgit v1.2.3


From 199f2128741077087a2ab33889a6868830465033 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Thu, 5 Sep 2024 15:09:46 -0400
Subject: SUNRPC: add svcauth_map_clnt_to_svc_cred_local

Add new funtion svcauth_map_clnt_to_svc_cred_local which maps a
generic cred to a svc_cred suitable for use in nfsd.

This is needed by the localio code to map nfs client creds to nfs
server credentials.

Following from net/sunrpc/auth_unix.c:unx_marshal() it is clear that
->fsuid and ->fsgid must be used (rather than ->uid and ->gid).  In
addition, these uid and gid must be translated with from_kuid_munged()
so local client uses correct uid and gid when acting as local server.

Jeff Layton noted:
  This is where the magic happens. Since we're working in
  kuid_t/kgid_t, we don't need to worry about further idmapping.

Suggested-by: NeilBrown <neilb@suse.de> # to approximate unx_marshal()
Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Co-developed-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/sunrpc/svcauth.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 63cf6fb26dcc..2e111153f7cd 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/clnt.h>
 #include <linux/hash.h>
 #include <linux/stringhash.h>
 #include <linux/cred.h>
@@ -157,6 +158,10 @@ extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp);
 extern int	svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops);
 extern void	svc_auth_unregister(rpc_authflavor_t flavor);
 
+extern void	svcauth_map_clnt_to_svc_cred_local(struct rpc_clnt *clnt,
+						   const struct cred *,
+						   struct svc_cred *);
+
 extern struct auth_domain *unix_domain_find(char *name);
 extern void auth_domain_put(struct auth_domain *item);
 extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new);
-- 
cgit v1.2.3


From 86ab08beb3f07f6e51922a8b8f662a5ec7012d35 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 5 Sep 2024 15:09:47 -0400
Subject: SUNRPC: replace program list with program array

A service created with svc_create_pooled() can be given a linked list of
programs and all of these will be served.

Using a linked list makes it cumbersome when there are several programs
that can be optionally selected with CONFIG settings.

After this patch is applied, API consumers must use only
svc_create_pooled() when creating an RPC service that listens for more
than one RPC program.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/sunrpc/svc.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index c419a61f60e5..e68fecf6eab5 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -67,9 +67,10 @@ enum {
  * We currently do not support more than one RPC program per daemon.
  */
 struct svc_serv {
-	struct svc_program *	sv_program;	/* RPC program */
+	struct svc_program *	sv_programs;	/* RPC programs */
 	struct svc_stat *	sv_stats;	/* RPC statistics */
 	spinlock_t		sv_lock;
+	unsigned int		sv_nprogs;	/* Number of sv_programs */
 	unsigned int		sv_nrthreads;	/* # of server threads */
 	unsigned int		sv_maxconn;	/* max connections allowed or
 						 * '0' causing max to be based
@@ -360,10 +361,9 @@ struct svc_process_info {
 };
 
 /*
- * List of RPC programs on the same transport endpoint
+ * RPC program - an array of these can use the same transport endpoint
  */
 struct svc_program {
-	struct svc_program *	pg_next;	/* other programs (same xprt) */
 	u32			pg_prog;	/* program number */
 	unsigned int		pg_lovers;	/* lowest version */
 	unsigned int		pg_hivers;	/* highest version */
@@ -441,6 +441,7 @@ bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *prog,
+				     unsigned int nprog,
 				     struct svc_stat *stats,
 				     unsigned int bufsize,
 				     int (*threadfn)(void *data));
-- 
cgit v1.2.3


From 2a33a85be45178198245e1f656e6224c899895e4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:48 -0400
Subject: nfs_common: add NFS LOCALIO auxiliary protocol enablement

fs/nfs_common/nfslocalio.c provides interfaces that enable an NFS
client to generate a nonce (single-use UUID) and associated nfs_uuid_t
struct, register it with nfs_common for subsequent lookup and
verification by the NFS server and if matched the NFS server populates
members in the nfs_uuid_t struct.

nfs_common's nfs_uuids list is the basis for localio enablement, as
such it has members that point to nfsd memory for direct use by the
client (e.g. 'net' is the server's network namespace, through it the
client can access nn->nfsd_serv).

This commit also provides the base nfs_uuid_t interfaces to allow
proper net namespace refcounting for the LOCALIO use case.

CONFIG_NFS_LOCALIO controls the nfs_common, NFS server and NFS client
enablement for LOCALIO. If both NFS_FS=m and NFSD=m then
NFS_COMMON_LOCALIO_SUPPORT=m and nfs_localio.ko is built (and provides
nfs_common's LOCALIO support).

  # lsmod | grep nfs_localio
  nfs_localio            12288  2 nfsd,nfs
  sunrpc                745472  35 nfs_localio,nfsd,auth_rpcgss,lockd,nfsv3,nfs

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: NeilBrown <neilb@suse.de>
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfslocalio.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/linux/nfslocalio.h

(limited to 'include/linux')

diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
new file mode 100644
index 000000000000..4165ff8390c1
--- /dev/null
+++ b/include/linux/nfslocalio.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ * Copyright (C) 2024 NeilBrown <neilb@suse.de>
+ */
+#ifndef __LINUX_NFSLOCALIO_H
+#define __LINUX_NFSLOCALIO_H
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/uuid.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/nfs.h>
+#include <net/net_namespace.h>
+
+/*
+ * Useful to allow a client to negotiate if localio
+ * possible with its server.
+ *
+ * See Documentation/filesystems/nfs/localio.rst for more detail.
+ */
+typedef struct {
+	uuid_t uuid;
+	struct list_head list;
+	struct net *net; /* nfsd's network namespace */
+	struct auth_domain *dom; /* auth_domain for localio */
+} nfs_uuid_t;
+
+void nfs_uuid_begin(nfs_uuid_t *);
+void nfs_uuid_end(nfs_uuid_t *);
+void nfs_uuid_is_local(const uuid_t *, struct list_head *,
+		       struct net *, struct auth_domain *, struct module *);
+void nfs_uuid_invalidate_clients(struct list_head *list);
+void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid);
+
+#endif  /* __LINUX_NFSLOCALIO_H */
-- 
cgit v1.2.3


From a61e147e6be6e763d9c6dec8061d2893c0bb3423 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:49 -0400
Subject: nfs_common: prepare for the NFS client to use nfsd_file for LOCALIO

The next commit will introduce nfsd_open_local_fh() which returns an
nfsd_file structure.  This commit exposes LOCALIO's required NFSD
symbols to the NFS client:

- Make nfsd_open_local_fh() symbol and other required NFSD symbols
  available to NFS in a global 'nfs_to' nfsd_localio_operations
  struct (global access suggested by Trond, nfsd_localio_operations
  suggested by NeilBrown).  The next commit will also introduce
  nfsd_localio_ops_init() that init_nfsd() will call to initialize
  'nfs_to'.

- Introduce nfsd_file_file() that provides access to nfsd_file's
  backing file.  Keeps nfsd_file structure opaque to NFS client (as
  suggested by Jeff Layton).

- Introduce nfsd_file_put_local() that will put the reference to the
  nfsd_file's associated nn->nfsd_serv and then put the reference to
  the nfsd_file (as suggested by NeilBrown).

Suggested-by: Trond Myklebust <trond.myklebust@hammerspace.com> # nfs_to
Suggested-by: NeilBrown <neilb@suse.de> # nfsd_localio_operations
Suggested-by: Jeff Layton <jlayton@kernel.org> # nfsd_file_file
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfslocalio.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 4165ff8390c1..1a39d7d727bd 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/list.h>
 #include <linux/uuid.h>
+#include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svcauth.h>
 #include <linux/nfs.h>
 #include <net/net_namespace.h>
@@ -22,7 +23,7 @@
 typedef struct {
 	uuid_t uuid;
 	struct list_head list;
-	struct net *net; /* nfsd's network namespace */
+	struct net __rcu *net; /* nfsd's network namespace */
 	struct auth_domain *dom; /* auth_domain for localio */
 } nfs_uuid_t;
 
@@ -33,4 +34,28 @@ void nfs_uuid_is_local(const uuid_t *, struct list_head *,
 void nfs_uuid_invalidate_clients(struct list_head *list);
 void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid);
 
+struct nfsd_file;
+
+/* localio needs to map filehandle -> struct nfsd_file */
+extern struct nfsd_file *
+nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *,
+		   const struct cred *, const struct nfs_fh *,
+		   const fmode_t) __must_hold(rcu);
+
+struct nfsd_localio_operations {
+	bool (*nfsd_serv_try_get)(struct net *);
+	void (*nfsd_serv_put)(struct net *);
+	struct nfsd_file *(*nfsd_open_local_fh)(struct net *,
+						struct auth_domain *,
+						struct rpc_clnt *,
+						const struct cred *,
+						const struct nfs_fh *,
+						const fmode_t);
+	void (*nfsd_file_put_local)(struct nfsd_file *);
+	struct file *(*nfsd_file_file)(struct nfsd_file *);
+} ____cacheline_aligned;
+
+extern void nfsd_localio_ops_init(void);
+extern const struct nfsd_localio_operations *nfs_to;
+
 #endif  /* __LINUX_NFSLOCALIO_H */
-- 
cgit v1.2.3


From fa4983862e506d395acc1b8d14dbebf63acc2e82 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Thu, 5 Sep 2024 15:09:50 -0400
Subject: nfsd: add LOCALIO support

Add server support for bypassing NFS for localhost reads, writes, and
commits. This is only useful when both the client and server are
running on the same host.

If nfsd_open_local_fh() fails then the NFS client will both retry and
fallback to normal network-based read, write and commit operations if
localio is no longer supported.

Care is taken to ensure the same NFS security mechanisms are used
(authentication, etc) regardless of whether localio or regular NFS
access is used.  The auth_domain established as part of the traditional
NFS client access to the NFS server is also used for localio.  Store
auth_domain for localio in nfsd_uuid_t and transfer it to the client
if it is local to the server.

Relative to containers, localio gives the client access to the network
namespace the server has.  This is required to allow the client to
access the server's per-namespace nfsd_net struct.

This commit also introduces the use of NFSD's percpu_ref to interlock
nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is
not destroyed while in use by nfsd_open_local_fh and other LOCALIO
client code.

CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Co-developed-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: NeilBrown <neilb@suse.de>
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfslocalio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 1a39d7d727bd..546f46b4e46f 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -6,6 +6,8 @@
 #ifndef __LINUX_NFSLOCALIO_H
 #define __LINUX_NFSLOCALIO_H
 
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
 #include <linux/module.h>
 #include <linux/list.h>
 #include <linux/uuid.h>
@@ -58,4 +60,10 @@ struct nfsd_localio_operations {
 extern void nfsd_localio_ops_init(void);
 extern const struct nfsd_localio_operations *nfs_to;
 
+#else   /* CONFIG_NFS_LOCALIO */
+static inline void nfsd_localio_ops_init(void)
+{
+}
+#endif  /* CONFIG_NFS_LOCALIO */
+
 #endif  /* __LINUX_NFSLOCALIO_H */
-- 
cgit v1.2.3


From 946af9b3a0e7571c01447107d5e8968401e659ba Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:51 -0400
Subject: nfsd: implement server support for NFS_LOCALIO_PROGRAM

The LOCALIO auxiliary RPC protocol consists of a single "UUID_IS_LOCAL"
RPC method that allows the Linux NFS client to verify the local Linux
NFS server can see the nonce (single-use UUID) the client generated and
made available in nfs_common.  The server expects this protocol to use
the same transport as NFS and NFSACL for its RPCs.  This protocol
isn't part of an IETF standard, nor does it need to be considering it
is Linux-to-Linux auxiliary RPC protocol that amounts to an
implementation detail.

The UUID_IS_LOCAL method encodes the client generated uuid_t in terms of
the fixed UUID_SIZE (16 bytes).  The fixed size opaque encode and decode
XDR methods are used instead of the less efficient variable sized
methods.

The RPC program number for the NFS_LOCALIO_PROGRAM is 400122 (as assigned
by IANA, see https://www.iana.org/assignments/rpc-program-numbers/ ):
Linux Kernel Organization       400122  nfslocalio

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
[neilb: factored out and simplified single localio protocol]
Co-developed-by: NeilBrown <neilb@suse.de>
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index ceb70a926b95..73da75908d95 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -13,6 +13,13 @@
 #include <linux/crc32.h>
 #include <uapi/linux/nfs.h>
 
+/* The LOCALIO program is entirely private to Linux and is
+ * NOT part of the uapi.
+ */
+#define NFS_LOCALIO_PROGRAM		400122
+#define LOCALIOPROC_NULL		0
+#define LOCALIOPROC_UUID_IS_LOCAL	1
+
 /*
  * This is the kernel NFS client file handle representation
  */
-- 
cgit v1.2.3


From df24c483e28f7f9a421afde15d0497e61bc2d3ea Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 5 Sep 2024 15:09:52 -0400
Subject: nfs: pass struct nfsd_file to nfs_init_pgio and nfs_init_commit

The nfsd_file will be passed, in future commits, by callers
that enable LOCALIO support (for both regular NFS and pNFS IO).

[Derived from patch authored by Weston Andros Adamson, but switched
 from passing struct file to struct nfsd_file]

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfslocalio.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 546f46b4e46f..2c4e0fd9da6b 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -6,6 +6,9 @@
 #ifndef __LINUX_NFSLOCALIO_H
 #define __LINUX_NFSLOCALIO_H
 
+/* nfsd_file structure is purposely kept opaque to NFS client */
+struct nfsd_file;
+
 #if IS_ENABLED(CONFIG_NFS_LOCALIO)
 
 #include <linux/module.h>
@@ -36,8 +39,6 @@ void nfs_uuid_is_local(const uuid_t *, struct list_head *,
 void nfs_uuid_invalidate_clients(struct list_head *list);
 void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid);
 
-struct nfsd_file;
-
 /* localio needs to map filehandle -> struct nfsd_file */
 extern struct nfsd_file *
 nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *,
-- 
cgit v1.2.3


From 70ba381e1a431245c137ed597ec6a05991c79bd9 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Thu, 5 Sep 2024 15:09:53 -0400
Subject: nfs: add LOCALIO support

Add client support for bypassing NFS for localhost reads, writes, and
commits. This is only useful when the client and the server are
running on the same host.

nfs_local_probe() is stubbed out, later commits will enable client and
server handshake via a Linux-only LOCALIO auxiliary RPC protocol.

This has dynamic binding with the nfsd module (via nfs_localio module
which is part of nfs_common). LOCALIO will only work if nfsd is
already loaded.

The "localio_enabled" nfs kernel module parameter can be used to
disable and enable the ability to use LOCALIO support.

CONFIG_NFS_LOCALIO enables NFS client support for LOCALIO.

Lastly, LOCALIO uses an nfsd_file to initiate all IO. To make proper
use of nfsd_file (and nfsd's filecache) its lifetime (duration before
nfsd_file_put is called) must extend until after commit, read and
write operations. So rather than immediately drop the nfsd_file
reference in nfs_local_open_fh(), that doesn't happen until
nfs_local_pgio_release() for read/write and not until
nfs_local_release_commit_data() for commit. The same applies to the
reference held on nfsd's nn->nfsd_serv. Both objects' lifetimes and
associated references are managed through calls to
nfs_to->nfsd_file_put_local().

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Co-developed-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de> # nfs_open_local_fh
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
---
 include/linux/nfs.h        | 2 ++
 include/linux/nfs_fs_sb.h  | 9 +++++++++
 include/linux/nfslocalio.h | 4 ++++
 3 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index 73da75908d95..9ad727ddfedb 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -8,6 +8,8 @@
 #ifndef _LINUX_NFS_H
 #define _LINUX_NFS_H
 
+#include <linux/cred.h>
+#include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/string.h>
 #include <linux/crc32.h>
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index c49bfdded5c1..853df3fcd4c2 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -8,6 +8,7 @@
 #include <linux/wait.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/nfslocalio.h>
 
 #include <linux/atomic.h>
 #include <linux/refcount.h>
@@ -49,6 +50,7 @@ struct nfs_client {
 #define NFS_CS_DS		7		/* - Server is a DS */
 #define NFS_CS_REUSEPORT	8		/* - reuse src port on reconnect */
 #define NFS_CS_PNFS		9		/* - Server used for pnfs */
+#define NFS_CS_LOCAL_IO		10		/* - client is local */
 	struct sockaddr_storage	cl_addr;	/* server identifier */
 	size_t			cl_addrlen;
 	char *			cl_hostname;	/* hostname of server */
@@ -125,6 +127,13 @@ struct nfs_client {
 	struct net		*cl_net;
 	struct list_head	pending_cb_stateids;
 	struct rcu_head		rcu;
+
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+	struct timespec64	cl_nfssvc_boot;
+	seqlock_t		cl_boot_lock;
+	nfs_uuid_t		cl_uuid;
+	spinlock_t		cl_localio_lock;
+#endif /* CONFIG_NFS_LOCALIO */
 };
 
 /*
diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h
index 2c4e0fd9da6b..b353abe00357 100644
--- a/include/linux/nfslocalio.h
+++ b/include/linux/nfslocalio.h
@@ -61,6 +61,10 @@ struct nfsd_localio_operations {
 extern void nfsd_localio_ops_init(void);
 extern const struct nfsd_localio_operations *nfs_to;
 
+struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *,
+		   struct rpc_clnt *, const struct cred *,
+		   const struct nfs_fh *, const fmode_t);
+
 #else   /* CONFIG_NFS_LOCALIO */
 static inline void nfsd_localio_ops_init(void)
 {
-- 
cgit v1.2.3


From b166b8ab4189743a717cb93f50d6fcca3a46770d Mon Sep 17 00:00:00 2001
From: Jiqian Chen <Jiqian.Chen@amd.com>
Date: Tue, 24 Sep 2024 14:14:36 +0800
Subject: xen/pvh: Setup gsi for passthrough device

In PVH dom0, the gsis don't get registered, but the gsi of
a passthrough device must be configured for it to be able to be
mapped into a domU.

When assigning a device to passthrough, proactively setup the gsi
of the device during that process.

Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
Signed-off-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Jiqian Chen <Jiqian.Chen@amd.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Message-ID: <20240924061437.2636766-3-Jiqian.Chen@amd.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 include/linux/acpi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0687a442fec7..02ded9f53a6b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -362,6 +362,7 @@ void acpi_unregister_gsi (u32 gsi);
 
 struct pci_dev;
 
+struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin);
 int acpi_pci_irq_enable (struct pci_dev *dev);
 void acpi_penalize_isa_irq(int irq, int active);
 bool acpi_isa_irq_available(int irq);
-- 
cgit v1.2.3


From 2efddb5575cd9f5f4d61ad417c92365a5f18d2f1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 25 Sep 2024 10:57:00 +0200
Subject: Revert "driver core: shut down devices asynchronously"

This reverts commit 8064952c65045f05ee2671fe437770e50c151776.

The series is being reverted before -rc1 as there are still reports of
lockups on shutdown, so it's not quite ready for "prime time."

Reported-by: Andrey Skvortsov <andrej.skvortzov@gmail.com>
Link: https://lore.kernel.org/r/ZvMkkhyJrohaajuk@skv.local
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Jeffery <djeffery@redhat.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Laurence Oberman <loberman@redhat.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device/driver.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h
index 14c9211b82d6..5c04b8e3833b 100644
--- a/include/linux/device/driver.h
+++ b/include/linux/device/driver.h
@@ -56,7 +56,6 @@ enum probe_type {
  * @mod_name:	Used for built-in modules.
  * @suppress_bind_attrs: Disables bind/unbind via sysfs.
  * @probe_type:	Type of the probe (synchronous or asynchronous) to use.
- * @async_shutdown_enable: Enables devices to be shutdown asynchronously.
  * @of_match_table: The open firmware table.
  * @acpi_match_table: The ACPI match table.
  * @probe:	Called to query the existence of a specific device,
@@ -103,7 +102,6 @@ struct device_driver {
 
 	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
 	enum probe_type probe_type;
-	bool async_shutdown_enable;
 
 	const struct of_device_id	*of_match_table;
 	const struct acpi_device_id	*acpi_match_table;
-- 
cgit v1.2.3


From ce4db753de21abfb7516ef64aff907813e8a8e3e Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Mon, 26 Aug 2024 11:25:52 +0800
Subject: kprobes: Remove obsoleted declaration for init_test_probes

The init_test_probes() have been removed since
commit e44e81c5b90f ("kprobes: convert tests to kunit"), and now
it is useless, so remove it.

Link: https://lore.kernel.org/all/20240826032552.4016314-1-cuigaosheng1@huawei.com/

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 5fcbc254d186..8c4f3bb24429 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -269,15 +269,6 @@ extern unsigned long __stop_kprobe_blacklist[];
 
 extern struct kretprobe_blackpoint kretprobe_blacklist[];
 
-#ifdef CONFIG_KPROBES_SANITY_TEST
-extern int init_test_probes(void);
-#else /* !CONFIG_KPROBES_SANITY_TEST */
-static inline int init_test_probes(void)
-{
-	return 0;
-}
-#endif /* CONFIG_KPROBES_SANITY_TEST */
-
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
-- 
cgit v1.2.3


From d5dbf8b48a4620db771f399ed7fce32d447f04a6 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sun, 18 Aug 2024 19:42:58 +0900
Subject: tracepoint: Support iterating over tracepoints on modules

Add for_each_module_tracepoint() for iterating over tracepoints
on modules. This is similar to the for_each_kernel_tracepoint()
but only for the tracepoints on modules (not including kernel
built-in tracepoints).

Link: https://lore.kernel.org/all/172397777800.286558.14554748203446214056.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/tracepoint.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 6be396bb4297..837fcf8ec0d5 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -64,6 +64,8 @@ struct tp_module {
 bool trace_module_has_bad_taint(struct module *mod);
 extern int register_tracepoint_module_notifier(struct notifier_block *nb);
 extern int unregister_tracepoint_module_notifier(struct notifier_block *nb);
+void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *),
+				void *priv);
 #else
 static inline bool trace_module_has_bad_taint(struct module *mod)
 {
@@ -79,6 +81,11 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
+static inline
+void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *),
+				void *priv)
+{
+}
 #endif /* CONFIG_MODULES */
 
 /*
-- 
cgit v1.2.3


From d4df54f338e43c790460674a3cc7db35b8395421 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sun, 18 Aug 2024 19:43:07 +0900
Subject: tracepoint: Support iterating tracepoints in a loading module

Add for_each_tracepoint_in_module() function to iterate tracepoints in
a module. This API is needed for handling tracepoints in a loading
module from tracepoint_module_notifier callback function.
This also update for_each_module_tracepoint() to pass the module to
callback function so that it can find module easily.

Link: https://lore.kernel.org/all/172397778740.286558.15781131277732977643.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/tracepoint.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 837fcf8ec0d5..93a9f3070b48 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -64,8 +64,13 @@ struct tp_module {
 bool trace_module_has_bad_taint(struct module *mod);
 extern int register_tracepoint_module_notifier(struct notifier_block *nb);
 extern int unregister_tracepoint_module_notifier(struct notifier_block *nb);
-void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *),
+void for_each_module_tracepoint(void (*fct)(struct tracepoint *,
+					struct module *, void *),
 				void *priv);
+void for_each_tracepoint_in_module(struct module *,
+				   void (*fct)(struct tracepoint *,
+					struct module *, void *),
+				   void *priv);
 #else
 static inline bool trace_module_has_bad_taint(struct module *mod)
 {
@@ -82,10 +87,18 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
 	return 0;
 }
 static inline
-void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *),
+void for_each_module_tracepoint(void (*fct)(struct tracepoint *,
+					struct module *, void *),
 				void *priv)
 {
 }
+static inline
+void for_each_tracepoint_in_module(struct module *mod,
+				   void (*fct)(struct tracepoint *,
+					struct module *, void *),
+				   void *priv)
+{
+}
 #endif /* CONFIG_MODULES */
 
 /*
-- 
cgit v1.2.3


From 04e906839a053f092ef53f4fb2d610983412b904 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Thu, 19 Sep 2024 14:33:42 +0200
Subject: usbnet: fix cyclical race on disconnect with work queue

The work can submit URBs and the URBs can schedule the work.
This cycle needs to be broken, when a device is to be stopped.
Use a flag to do so.
This is a design issue as old as the driver.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
CC: stable@vger.kernel.org
Link: https://patch.msgid.link/20240919123525.688065-1-oneukum@suse.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/usb/usbnet.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 9f08a584d707..0b9f1e598e3a 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -76,8 +76,23 @@ struct usbnet {
 #		define EVENT_LINK_CHANGE	11
 #		define EVENT_SET_RX_MODE	12
 #		define EVENT_NO_IP_ALIGN	13
+/* This one is special, as it indicates that the device is going away
+ * there are cyclic dependencies between tasklet, timer and bh
+ * that must be broken
+ */
+#		define EVENT_UNPLUG		31
 };
 
+static inline bool usbnet_going_away(struct usbnet *ubn)
+{
+	return test_bit(EVENT_UNPLUG, &ubn->flags);
+}
+
+static inline void usbnet_mark_going_away(struct usbnet *ubn)
+{
+	set_bit(EVENT_UNPLUG, &ubn->flags);
+}
+
 static inline struct usb_driver *driver_of(struct usb_interface *intf)
 {
 	return to_usb_driver(intf->dev.driver);
-- 
cgit v1.2.3


From 8af79d3edb5fd2dce35ea0a71595b6d4f9962350 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 18 Sep 2024 15:13:39 +0200
Subject: netfilter: nfnetlink_queue: remove old clash resolution logic

For historical reasons there are two clash resolution spots in
netfilter, one in nfnetlink_queue and one in conntrack core.

nfnetlink_queue one was added first: If a colliding entry is found, NAT
NAT transformation is reversed by calling nat engine again with altered
tuple.

See commit 368982cd7d1b ("netfilter: nfnetlink_queue: resolve clash for
unconfirmed conntracks") for details.

One problem is that nf_reroute() won't take an action if the queueing
doesn't occur in the OUTPUT hook, i.e. when queueing in forward or
postrouting, packet will be sent via the wrong path.

Another problem is that the scenario addressed (2nd UDP packet sent with
identical addresses while first packet is still being processed) can also
occur without any nfqueue involvement due to threaded resolvers doing
A and AAAA requests back-to-back.

This lead us to add clash resolution logic to the conntrack core, see
commit 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing
entries").  Instead of fixing the nfqueue based logic, lets remove it
and let conntrack core handle this instead.

Retain the ->update hook for sake of nfqueue based conntrack helpers.
We could axe this hook completely but we'd have to split confirm and
helper logic again, see commit ee04805ff54a ("netfilter: conntrack: make
conntrack userspace helpers work again").

This SHOULD NOT be backported to kernels earlier than v5.6; they lack
adequate clash resolution handling.

Patch was originally written by Pablo Neira Ayuso.

Reported-by: Antonio Ojea <aojea@google.com>
Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1766
Signed-off-by: Florian Westphal <fw@strlen.de>
Tested-by: Antonio Ojea <aojea@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2683b2b77612..2b8aac2c70ad 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 struct nf_conn;
 enum nf_nat_manip_type;
 struct nlattr;
-enum ip_conntrack_dir;
 
 struct nf_nat_hook {
 	int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
 			       const struct nlattr *attr);
 	void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
-	unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
-				  enum nf_nat_manip_type mtype,
-				  enum ip_conntrack_dir dir);
 	void (*remove_nat_bysrc)(struct nf_conn *ct);
 };
 
-- 
cgit v1.2.3


From a78282e2c94f4ca80a2d7c56e4d1e9546be5596d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 26 Sep 2024 11:39:02 -0700
Subject: Revert "binfmt_elf, coredump: Log the reason of the failed core
 dumps"

This reverts commit fb97d2eb542faf19a8725afbd75cbc2518903210.

The logging was questionable to begin with, but it seems to actively
deadlock on the task lock.

 "On second thought, let's not log core dump failures. 'Tis a silly place"

because if you can't tell your core dump is truncated, maybe you should
just fix your debugger instead of adding bugs to the kernel.

Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
Link: https://lore.kernel.org/all/d122ece6-3606-49de-ae4d-8da88846bef2@oracle.com/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/coredump.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index edeb8532ce0f..45e598fe3476 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -42,7 +42,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
-extern int do_coredump(const kernel_siginfo_t *siginfo);
+extern void do_coredump(const kernel_siginfo_t *siginfo);
 
 /*
  * Logging for the coredump code, ratelimited.
@@ -62,11 +62,7 @@ extern int do_coredump(const kernel_siginfo_t *siginfo);
 #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__)
 
 #else
-static inline int do_coredump(const kernel_siginfo_t *siginfo)
-{
-	/* Coredump support is not available, can't fail. */
-	return 0;
-}
+static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
 
 #define coredump_report(...)
 #define coredump_report_failure(...)
-- 
cgit v1.2.3


From 26a8ea80929c518bdec5e53a5776f95919b7c88e Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Tue, 3 Sep 2024 07:25:19 -0700
Subject: mm/hugetlb: fix memfd_pin_folios resv_huge_pages leak

memfd_pin_folios followed by unpin_folios leaves resv_huge_pages elevated
if the pages were not already faulted in.  During a normal page fault,
resv_huge_pages is consumed here:

hugetlb_fault()
  alloc_hugetlb_folio()
    dequeue_hugetlb_folio_vma()
      dequeue_hugetlb_folio_nodemask()
        dequeue_hugetlb_folio_node_exact()
          free_huge_pages--
      resv_huge_pages--

During memfd_pin_folios, the page is created by calling
alloc_hugetlb_folio_nodemask instead of alloc_hugetlb_folio, and
resv_huge_pages is not modified:

memfd_alloc_folio()
  alloc_hugetlb_folio_nodemask()
    dequeue_hugetlb_folio_nodemask()
      dequeue_hugetlb_folio_node_exact()
        free_huge_pages--

alloc_hugetlb_folio_nodemask has other callers that must not modify
resv_huge_pages.  Therefore, to fix, define an alternate version of
alloc_hugetlb_folio_nodemask for this call site that adjusts
resv_huge_pages.

Link: https://lkml.kernel.org/r/1725373521-451395-4-git-send-email-steven.sistare@oracle.com
Fixes: 89c1905d9c14 ("mm/gup: introduce memfd_pin_folios() for pinning memfd folios")
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Acked-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 98c47c394b89..e4697539b665 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -692,6 +692,9 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask,
 				bool allow_alloc_fallback);
+struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
+					  nodemask_t *nmask, gfp_t gfp_mask);
+
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1059,6 +1062,13 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	return NULL;
 }
 
+static inline struct folio *
+alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
+			    nodemask_t *nmask, gfp_t gfp_mask)
+{
+	return NULL;
+}
+
 static inline struct folio *
 alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 			nodemask_t *nmask, gfp_t gfp_mask,
-- 
cgit v1.2.3


From c5b1184decc819756ae549ba54c63b6790c4ddfd Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 24 Sep 2024 14:27:10 +0800
Subject: compiler.h: specify correct attribute for .rodata..c_jump_table

Currently, there is an assembler message when generating kernel/bpf/core.o
under CONFIG_OBJTOOL with LoongArch compiler toolchain:

  Warning: setting incorrect section attributes for .rodata..c_jump_table

This is because the section ".rodata..c_jump_table" should be readonly,
but there is a "W" (writable) part of the flags:

  $ readelf -S kernel/bpf/core.o | grep -A 1 "rodata..c"
  [34] .rodata..c_j[...] PROGBITS         0000000000000000  0000d2e0
       0000000000000800  0000000000000000  WA       0     0     8

There is no above issue on x86 due to the generated section flag is only
"A" (allocatable). In order to silence the warning on LoongArch, specify
the attribute like ".rodata..c_jump_table,\"a\",@progbits #" explicitly,
then the section attribute of ".rodata..c_jump_table" must be readonly
in the kernel/bpf/core.o file.

Before:

  $ objdump -h kernel/bpf/core.o | grep -A 1 "rodata..c"
   21 .rodata..c_jump_table 00000800  0000000000000000  0000000000000000  0000d2e0  2**3
                  CONTENTS, ALLOC, LOAD, RELOC, DATA

After:

  $ objdump -h kernel/bpf/core.o | grep -A 1 "rodata..c"
   21 .rodata..c_jump_table 00000800  0000000000000000  0000000000000000  0000d2e0  2**3
                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA

By the way, AFAICT, maybe the root cause is related with the different
compiler behavior of various archs, so to some extent this change is a
workaround for LoongArch, and also there is no effect for x86 which is the
only port supported by objtool before LoongArch with this patch.

Link: https://lkml.kernel.org/r/20240924062710.1243-1-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>	[6.9+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ec55bcce4146..4d4e23b6e3e7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -133,7 +133,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #define annotate_unreachable() __annotate_unreachable(__COUNTER__)
 
 /* Annotate a C jump table to allow objtool to follow the code flow */
-#define __annotate_jump_table __section(".rodata..c_jump_table")
+#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #")
 
 #else /* !CONFIG_OBJTOOL */
 #define annotate_reachable()
-- 
cgit v1.2.3


From cb787f4ac0c2e439ea8d7e6387b925f74576bdf8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 Sep 2024 02:56:11 +0100
Subject: [tree-wide] finally take no_llseek out

no_llseek had been defined to NULL two years ago, in commit 868941b14441
("fs: remove no_llseek")

To quote that commit,

  At -rc1 we'll need do a mechanical removal of no_llseek -

  git grep -l -w no_llseek | grep -v porting.rst | while read i; do
	sed -i '/\<no_llseek\>/d' $i
  done

  would do it.

Unfortunately, that hadn't been done.  Linus, could you do that now, so
that we could finally put that thing to rest? All instances are of the
form
	.llseek = no_llseek,
so it's obviously safe.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugfs.h | 1 -
 include/linux/fs.h      | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index c9c65b132c0f..0928a6c8ae1e 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -57,7 +57,6 @@ static const struct file_operations __fops = {				\
 	.release = simple_attr_release,					\
 	.read	 = debugfs_attr_read,					\
 	.write	 = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write,	\
-	.llseek  = no_llseek,						\
 }
 
 #define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt)		\
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eae5b67e4a15..e3c603d01337 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3234,7 +3234,6 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
-#define no_llseek NULL
 extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
-- 
cgit v1.2.3