From dc55e35f9e810f23dd69cfdc91a3d636023f57a2 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 14 Feb 2022 19:18:14 +0100 Subject: ipc: Store mqueue sysctls in the ipc namespace Right now, the mqueue sysctls take ipc namespaces into account in a rather hacky way. This works in most cases, but does not respect the user namespace. Within the user namespace, the user cannot change the /proc/sys/fs/mqueue/* parametres. This poses a problem in the rootless containers. To solve this I changed the implementation of the mqueue sysctls just like some other sysctls. So far, the changes do not provide additional access to files. This will be done in a future patch. v3: * Don't implemenet set_permissions to keep the current behavior. v2: * Fixed compilation problem if CONFIG_POSIX_MQUEUE_SYSCTL is not specified. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b0ccbb2489119f1f20c737cf1930c3a9c4e4243a.1644862280.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/ipc_namespace.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b75395ec8d52..fa787d97d60a 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include struct user_namespace; @@ -63,6 +64,9 @@ struct ipc_namespace { unsigned int mq_msg_default; unsigned int mq_msgsize_default; + struct ctl_table_set mq_set; + struct ctl_table_header *mq_sysctls; + /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; @@ -169,14 +173,18 @@ static inline void put_ipc_ns(struct ipc_namespace *ns) #ifdef CONFIG_POSIX_MQUEUE_SYSCTL -struct ctl_table_header; -extern struct ctl_table_header *mq_register_sysctl_table(void); +void retire_mq_sysctls(struct ipc_namespace *ns); +bool setup_mq_sysctls(struct ipc_namespace *ns); #else /* CONFIG_POSIX_MQUEUE_SYSCTL */ -static inline struct ctl_table_header *mq_register_sysctl_table(void) +static inline void retire_mq_sysctls(struct ipc_namespace *ns) { - return NULL; +} + +static inline bool setup_mq_sysctls(struct ipc_namespace *ns) +{ + return true; } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ -- cgit v1.2.3 From 1f5c135ee509e89e0cc274333a65f73c62cb16e5 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 14 Feb 2022 19:18:15 +0100 Subject: ipc: Store ipc sysctls in the ipc namespace The ipc sysctls are not available for modification inside the user namespace. Following the mqueue sysctls, we changed the implementation to be more userns friendly. So far, the changes do not provide additional access to files. This will be done in a future patch. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/be6f9d014276f4dddd0c3aa05a86052856c1c555.1644862280.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- include/linux/ipc_namespace.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index fa787d97d60a..e3e8c8662b49 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -67,6 +67,9 @@ struct ipc_namespace { struct ctl_table_set mq_set; struct ctl_table_header *mq_sysctls; + struct ctl_table_set ipc_set; + struct ctl_table_header *ipc_sysctls; + /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; @@ -188,4 +191,22 @@ static inline bool setup_mq_sysctls(struct ipc_namespace *ns) } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ + +#ifdef CONFIG_SYSVIPC_SYSCTL + +bool setup_ipc_sysctls(struct ipc_namespace *ns); +void retire_ipc_sysctls(struct ipc_namespace *ns); + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +static inline void retire_ipc_sysctls(struct ipc_namespace *ns) +{ +} + +static inline bool setup_ipc_sysctls(struct ipc_namespace *ns) +{ + return true; +} + +#endif /* CONFIG_SYSVIPC_SYSCTL */ #endif -- cgit v1.2.3 From 8733068b9bdbc7a54f02dcc59eb0e4789cd60942 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 3 Mar 2022 15:41:17 +0000 Subject: KVM: x86/xen: Make kvm_xen_set_evtchn() reusable from other places Clean it up to return -errno on error consistently, while still being compatible with the return conventions for kvm_arch_set_irq_inatomic() and the kvm_set_irq() callback. We use -ENOTCONN to indicate when the port is masked. No existing users care, except that it's negative. Also allow it to optimise the vCPU lookup. Unless we abuse the lapic map, there is no quick lookup from APIC ID to a vCPU; the logic in kvm_get_vcpu_by_id() will just iterate over all vCPUs till it finds the one it wants. So do that just once and stash the result in the struct kvm_xen_evtchn for next time. Signed-off-by: David Woodhouse Signed-off-by: Paolo Bonzini Message-Id: <20220303154127.202856-8-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3f9b22c4983a..252ee4a61b58 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -611,7 +611,8 @@ struct kvm_hv_sint { struct kvm_xen_evtchn { u32 port; - u32 vcpu; + u32 vcpu_id; + int vcpu_idx; u32 priority; }; -- cgit v1.2.3 From 8bea9af887de4c99a95f93f2ce400ef63e8b4e9b Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 22 Mar 2022 12:50:27 +0200 Subject: iio: adc: ad_sigma_delta: Add sequencer support Some sigma-delta chips support sampling of multiple channels in continuous mode. When the operating with more than one channel enabled, the channel sequencer cycles through the enabled channels in sequential order, from first channel to the last one. If a channel is disabled, it is skipped by the sequencer. If more than one channel is used in continuous mode, instruct the device to append the status to the SPI transfer (1 extra byte) every time we receive a sample. All sigma-delta chips possessing a sampling sequencer have this ability. Inside the status register there will be the number of the converted channel. In this way, even if the CPU won't keep up with the sampling rate, it won't send to userspace wrong channel samples. When multiple channels are enabled in continuous mode, the device needs to perform a measurement on all slots before we can push to userspace the sample. If, during sequencing and data reading, a channel measurement is lost, a desync occurred. In this case, ad_sigma_delta drops the incomplete sample and waits for the device to send the measurement on the first active slot. Co-developed-by: Alexandru Tachici Signed-off-by: Alexandru Tachici Signed-off-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20220322105029.86389-5-alexandru.tachici@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index c525fd51652f..7852f6c9a714 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -32,26 +32,34 @@ struct iio_dev; /** * struct ad_sigma_delta_info - Sigma Delta driver specific callbacks and options * @set_channel: Will be called to select the current channel, may be NULL. + * @append_status: Will be called to enable status append at the end of the sample, may be NULL. * @set_mode: Will be called to select the current mode, may be NULL. + * @disable_all: Will be called to disable all channels, may be NULL. * @postprocess_sample: Is called for each sampled data word, can be used to * modify or drop the sample data, it, may be NULL. * @has_registers: true if the device has writable and readable registers, false * if there is just one read-only sample data shift register. * @addr_shift: Shift of the register address in the communications register. * @read_mask: Mask for the communications register having the read bit set. + * @status_ch_mask: Mask for the channel number stored in status register. * @data_reg: Address of the data register, if 0 the default address of 0x3 will * be used. * @irq_flags: flags for the interrupt used by the triggered buffer + * @num_slots: Number of sequencer slots */ struct ad_sigma_delta_info { int (*set_channel)(struct ad_sigma_delta *, unsigned int channel); + int (*append_status)(struct ad_sigma_delta *, bool append); int (*set_mode)(struct ad_sigma_delta *, enum ad_sigma_delta_mode mode); + int (*disable_all)(struct ad_sigma_delta *); int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample); bool has_registers; unsigned int addr_shift; unsigned int read_mask; + unsigned int status_ch_mask; unsigned int data_reg; unsigned long irq_flags; + unsigned int num_slots; }; /** @@ -76,6 +84,13 @@ struct ad_sigma_delta { uint8_t comm; const struct ad_sigma_delta_info *info; + unsigned int active_slots; + unsigned int current_slot; + unsigned int num_slots; + bool status_appended; + /* map slots to channels in order to know what to expect from devices */ + unsigned int *slots; + uint8_t *samples_buf; /* * DMA (thus cache coherency maintenance) requires the @@ -97,6 +112,29 @@ static inline int ad_sigma_delta_set_channel(struct ad_sigma_delta *sd, return 0; } +static inline int ad_sigma_delta_append_status(struct ad_sigma_delta *sd, bool append) +{ + int ret; + + if (sd->info->append_status) { + ret = sd->info->append_status(sd, append); + if (ret < 0) + return ret; + + sd->status_appended = append; + } + + return 0; +} + +static inline int ad_sigma_delta_disable_all(struct ad_sigma_delta *sd) +{ + if (sd->info->disable_all) + return sd->info->disable_all(sd); + + return 0; +} + static inline int ad_sigma_delta_set_mode(struct ad_sigma_delta *sd, unsigned int mode) { -- cgit v1.2.3 From e87f4152e542610d0b4c6c8548964a68a59d2040 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 20:02:41 +0100 Subject: task_stack, x86/cea: Force-inline stack helpers Force-inline two stack helpers to fix the following objtool warnings: vmlinux.o: warning: objtool: in_task_stack()+0xc: call to task_stack_page() leaves .noinstr.text section vmlinux.o: warning: objtool: in_entry_stack()+0x10: call to cpu_entry_stack() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-2-bp@alien8.de --- include/linux/sched/task_stack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 892562ebbd3a..5e799a47431e 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -16,7 +16,7 @@ * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ -static inline void *task_stack_page(const struct task_struct *task) +static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } -- cgit v1.2.3 From f0e3c6261af183f0c2246cfe691abec78377622c Mon Sep 17 00:00:00 2001 From: Johnson Wang Date: Fri, 1 Apr 2022 16:02:11 +0800 Subject: regulator: mt6366: Add support for MT6366 regulator The MT6366 is a regulator found on boards based on MediaTek MT8186 and probably other SoCs. It is a so called pmic and connects as a slave to SoC using SPI, wrapped inside the pmic-wrapper. Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Johnson Wang Link: https://lore.kernel.org/r/20220401080212.27383-2-johnson.wang@mediatek.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6358-regulator.h | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/mt6358-regulator.h b/include/linux/regulator/mt6358-regulator.h index 1cc304946d09..bdcf83cd719e 100644 --- a/include/linux/regulator/mt6358-regulator.h +++ b/include/linux/regulator/mt6358-regulator.h @@ -48,9 +48,54 @@ enum { MT6358_ID_VLDO28, MT6358_ID_VAUD28, MT6358_ID_VSIM2, + MT6358_ID_VCORE_SSHUB, + MT6358_ID_VSRAM_OTHERS_SSHUB, MT6358_ID_RG_MAX, }; +enum { + MT6366_ID_VDRAM1 = 0, + MT6366_ID_VCORE, + MT6366_ID_VPA, + MT6366_ID_VPROC11, + MT6366_ID_VPROC12, + MT6366_ID_VGPU, + MT6366_ID_VS2, + MT6366_ID_VMODEM, + MT6366_ID_VS1, + MT6366_ID_VDRAM2, + MT6366_ID_VSIM1, + MT6366_ID_VIBR, + MT6366_ID_VRF12, + MT6366_ID_VIO18, + MT6366_ID_VUSB, + MT6366_ID_VCN18, + MT6366_ID_VFE28, + MT6366_ID_VSRAM_PROC11, + MT6366_ID_VCN28, + MT6366_ID_VSRAM_OTHERS, + MT6366_ID_VSRAM_GPU, + MT6366_ID_VXO22, + MT6366_ID_VEFUSE, + MT6366_ID_VAUX18, + MT6366_ID_VMCH, + MT6366_ID_VBIF28, + MT6366_ID_VSRAM_PROC12, + MT6366_ID_VEMC, + MT6366_ID_VIO28, + MT6366_ID_VA12, + MT6366_ID_VRF18, + MT6366_ID_VCN33_BT, + MT6366_ID_VCN33_WIFI, + MT6366_ID_VMC, + MT6366_ID_VAUD28, + MT6366_ID_VSIM2, + MT6366_ID_VCORE_SSHUB, + MT6366_ID_VSRAM_OTHERS_SSHUB, + MT6366_ID_RG_MAX, +}; + #define MT6358_MAX_REGULATOR MT6358_ID_RG_MAX +#define MT6366_MAX_REGULATOR MT6366_ID_RG_MAX #endif /* __LINUX_REGULATOR_MT6358_H */ -- cgit v1.2.3 From 8b023accc8df70e72f7704d29fead7ca914d6837 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 14 Mar 2022 15:19:03 -0700 Subject: lockdep: Fix -Wunused-parameter for _THIS_IP_ While looking into a bug related to the compiler's handling of addresses of labels, I noticed some uses of _THIS_IP_ seemed unused in lockdep. Drive by cleanup. -Wunused-parameter: kernel/locking/lockdep.c:1383:22: warning: unused parameter 'ip' kernel/locking/lockdep.c:4246:48: warning: unused parameter 'ip' kernel/locking/lockdep.c:4844:19: warning: unused parameter 'ip' Signed-off-by: Nick Desaulniers Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20220314221909.2027027-1-ndesaulniers@google.com --- include/linux/irqflags.h | 4 ++-- include/linux/kvm_host.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 4b140938b03e..5ec0fa71399e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -20,13 +20,13 @@ #ifdef CONFIG_PROVE_LOCKING extern void lockdep_softirqs_on(unsigned long ip); extern void lockdep_softirqs_off(unsigned long ip); - extern void lockdep_hardirqs_on_prepare(unsigned long ip); + extern void lockdep_hardirqs_on_prepare(void); extern void lockdep_hardirqs_on(unsigned long ip); extern void lockdep_hardirqs_off(unsigned long ip); #else static inline void lockdep_softirqs_on(unsigned long ip) { } static inline void lockdep_softirqs_off(unsigned long ip) { } - static inline void lockdep_hardirqs_on_prepare(unsigned long ip) { } + static inline void lockdep_hardirqs_on_prepare(void) { } static inline void lockdep_hardirqs_on(unsigned long ip) { } static inline void lockdep_hardirqs_off(unsigned long ip) { } #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3f9b22c4983a..e1b2413750c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -450,7 +450,7 @@ static __always_inline void guest_state_enter_irqoff(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); guest_context_enter_irqoff(); -- cgit v1.2.3 From bfe4daf850f45d92dcd3da477f0b0456620294c3 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:05 -0700 Subject: perf/core: Add perf_clear_branch_entry_bitfields() helper Make it simpler to reset all the info fields on the perf_branch_entry by adding a helper inline function. The goal is to centralize the initialization to avoid missing a field in case more are added. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-2-eranian@google.com --- include/linux/perf_event.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af97dd427501..a411080d5169 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1063,6 +1063,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by caller. + */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, -- cgit v1.2.3 From 2a606a18cd672a16343d146a126721b34cc6adbd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:12 -0700 Subject: ACPI: Add perf low power callback Add an optional callback needed by some PMU features, e.g., AMD BRS, to give a chance to the perf_events code to change its state before a CPU goes to low power and after it comes back. The callback is void when the PERF_NEEDS_LOPWR_CB flag is not set. This flag must be set in arch specific perf_event.h header whenever needed. When not set, there is no impact on the ACPI code. Signed-off-by: Stephane Eranian [peterz: build fix] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-9-eranian@google.com --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a411080d5169..da759560eec5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1676,4 +1676,10 @@ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); +#ifndef PERF_NEEDS_LOPWR_CB +static inline void perf_lopwr_cb(bool mode) +{ +} +#endif + #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From cfe43f478b79ba45573ca22d52d0d8823be068fa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 12 Nov 2021 18:52:01 +0000 Subject: preempt/dynamic: Introduce preemption model accessors CONFIG_PREEMPT{_NONE, _VOLUNTARY} designate either: o The build-time preemption model when !PREEMPT_DYNAMIC o The default boot-time preemption model when PREEMPT_DYNAMIC IOW, using those on PREEMPT_DYNAMIC kernels is meaningless - the actual model could have been set to something else by the "preempt=foo" cmdline parameter. Same problem applies to CONFIG_PREEMPTION. Introduce a set of helpers to determine the actual preemption model used by the live kernel. Suggested-by: Marco Elver Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20211112185203.280040-3-valentin.schneider@arm.com --- include/linux/sched.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..67f06f72c50e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2117,6 +2117,47 @@ static inline void cond_resched_rcu(void) #endif } +#ifdef CONFIG_PREEMPT_DYNAMIC + +extern bool preempt_model_none(void); +extern bool preempt_model_voluntary(void); +extern bool preempt_model_full(void); + +#else + +static inline bool preempt_model_none(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_NONE); +} +static inline bool preempt_model_voluntary(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); +} +static inline bool preempt_model_full(void) +{ + return IS_ENABLED(CONFIG_PREEMPT); +} + +#endif + +static inline bool preempt_model_rt(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_RT); +} + +/* + * Does the preemption model allow non-cooperative preemption? + * + * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with + * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the + * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the + * PREEMPT_NONE model. + */ +static inline bool preempt_model_preemptible(void) +{ + return preempt_model_full() || preempt_model_rt(); +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, -- cgit v1.2.3 From 8c756a0a2de17f1535ef885ac7e556e016735eb2 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 31 Mar 2022 15:54:47 +0300 Subject: device property: Convert device_{dma_supported,get_dma_attr} to fwnode Make the device_dma_supported and device_get_dma_attr functions to use the fwnode ops, and move the implementation to ACPI and OF frameworks. Signed-off-by: Sakari Ailus Acked-by: Rob Herring Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- include/linux/fwnode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 3a532ba66f6c..6f307f21fc65 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -113,6 +113,9 @@ struct fwnode_operations { bool (*device_is_available)(const struct fwnode_handle *fwnode); const void *(*device_get_match_data)(const struct fwnode_handle *fwnode, const struct device *dev); + bool (*device_dma_supported)(const struct fwnode_handle *fwnode); + enum dev_dma_attr + (*device_get_dma_attr)(const struct fwnode_handle *fwnode); bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); int (*property_read_int_array)(const struct fwnode_handle *fwnode, -- cgit v1.2.3 From 68b979d068d3d0dceb14c446f664433d96f20a7e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 31 Mar 2022 15:54:49 +0300 Subject: device property: Add iomap to fwnode operations Add iomap() fwnode operation to implement fwnode_iomap() through fwnode operations, moving the code in fwnode_iomap() to OF framework. Note that the IS_ENABLED(CONFIG_OF_ADDRESS) && is_of_node(fwnode) check is needed for Sparc that has its own implementation of of_iomap anyway. Let the pre-compiler to handle that check. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 6f307f21fc65..ebbc3bf03f95 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -148,6 +148,7 @@ struct fwnode_operations { (*graph_get_port_parent)(struct fwnode_handle *fwnode); int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); + void __iomem *(*iomap)(struct fwnode_handle *fwnode, int index); int (*add_links)(struct fwnode_handle *fwnode); }; -- cgit v1.2.3 From 99c63707bafd15bcf97fbd6bef1c92d5bfa01d28 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 31 Mar 2022 15:54:50 +0300 Subject: device property: Add irq_get to fwnode operation Add irq_get() fwnode operation to implement fwnode_irq_get() through fwnode operations, moving the code in fwnode_irq_get() to OF and ACPI frameworks. Signed-off-by: Sakari Ailus Acked-by: Rob Herring Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index ebbc3bf03f95..6ab69871b06d 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -149,6 +149,7 @@ struct fwnode_operations { int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); void __iomem *(*iomap)(struct fwnode_handle *fwnode, int index); + int (*irq_get)(const struct fwnode_handle *fwnode, unsigned int index); int (*add_links)(struct fwnode_handle *fwnode); }; -- cgit v1.2.3 From 1be9473e31ab87ad1b6ecf9fd11df461930ede85 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:34 +0000 Subject: module: Move livepatch support to a separate file No functional change. This patch migrates livepatch support (i.e. used during module add/or load and remove/or deletion) from core module code into kernel/module/livepatch.c. At the moment it contains code to persist Elf information about a given livepatch module, only. The new file was added to MAINTAINERS. Reviewed-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- include/linux/module.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 1e135fd5c076..7ec9715de7dc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -663,17 +663,14 @@ static inline bool module_requested_async_probing(struct module *module) return module && module->async_probe_requested; } -#ifdef CONFIG_LIVEPATCH static inline bool is_livepatch_module(struct module *mod) { +#ifdef CONFIG_LIVEPATCH return mod->klp; -} -#else /* !CONFIG_LIVEPATCH */ -static inline bool is_livepatch_module(struct module *mod) -{ +#else return false; +#endif } -#endif /* CONFIG_LIVEPATCH */ bool is_module_sig_enforced(void); void set_module_sig_enforced(void); -- cgit v1.2.3 From 0c1e42805c25c87eb7a6f3b18bdbf3b3b7840aff Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:37 +0000 Subject: module: Move extra signature support out of core code No functional change. This patch migrates additional module signature check code from core module code into kernel/module/signing.c. Reviewed-by: Christophe Leroy Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- include/linux/module.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 7ec9715de7dc..5e2059f3afc7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -672,7 +672,6 @@ static inline bool is_livepatch_module(struct module *mod) #endif } -bool is_module_sig_enforced(void); void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ @@ -799,10 +798,6 @@ static inline bool module_requested_async_probing(struct module *module) return false; } -static inline bool is_module_sig_enforced(void) -{ - return false; -} static inline void set_module_sig_enforced(void) { @@ -854,11 +849,18 @@ static inline bool retpoline_module_ok(bool has_retpoline) #endif #ifdef CONFIG_MODULE_SIG +bool is_module_sig_enforced(void); + static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ +static inline bool is_module_sig_enforced(void) +{ + return false; +} + static inline bool module_sig_ok(struct module *module) { return true; -- cgit v1.2.3 From f64205a42046d3802c423fa2059e7fca39af127c Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 22 Mar 2022 14:03:43 +0000 Subject: module: Move kdb module related code out of main kdb code No functional change. This patch migrates the kdb 'lsmod' command support out of main kdb code into its own file under kernel/module. In addition to the above, a minor style warning i.e. missing a blank line after declarations, was resolved too. The new file was added to MAINTAINERS. Finally we remove linux/module.h as it is entirely redundant. Reviewed-by: Daniel Thompson Acked-by: Daniel Thompson Signed-off-by: Aaron Tomlin Signed-off-by: Luis Chamberlain --- include/linux/kdb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ea0f5e580fac..07dfb6a20a1c 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -222,5 +222,6 @@ enum { extern int kdbgetintenv(const char *, int *); extern int kdb_set(int, const char **); +int kdb_lsmod(int argc, const char **argv); #endif /* !_KDB_H */ -- cgit v1.2.3 From 01dc0386efb769056257410ba5754558384006a7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 23 Feb 2022 13:02:14 +0100 Subject: module: Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC to allow architectures to request having modules data in vmalloc area instead of module area. This is required on powerpc book3s/32 in order to set data non executable, because it is not possible to set executability on page basis, this is done per 256 Mbytes segments. The module area has exec right, vmalloc area has noexec. This can also be useful on other powerpc/32 in order to maximize the chance of code being close enough to kernel core to avoid branch trampolines. Cc: Jason Wessel Acked-by: Daniel Thompson Cc: Douglas Anderson Signed-off-by: Christophe Leroy [mcgrof: rebased in light of kernel/module/kdb.c move] Signed-off-by: Luis Chamberlain --- include/linux/module.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5e2059f3afc7..46d4d5f2516e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -422,6 +422,9 @@ struct module { /* Core layout: rbtree is accessed frequently, so keep together. */ struct module_layout core_layout __module_layout_align; struct module_layout init_layout; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + struct module_layout data_layout; +#endif /* Arch-specific module values */ struct mod_arch_specific arch; @@ -569,6 +572,11 @@ bool is_module_text_address(unsigned long addr); static inline bool within_module_core(unsigned long addr, const struct module *mod) { +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + if ((unsigned long)mod->data_layout.base <= addr && + addr < (unsigned long)mod->data_layout.base + mod->data_layout.size) + return true; +#endif return (unsigned long)mod->core_layout.base <= addr && addr < (unsigned long)mod->core_layout.base + mod->core_layout.size; } -- cgit v1.2.3 From baa914cd81f51f4e4f3bae5bb59764b32ad8c353 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 5 Apr 2022 16:22:20 +0900 Subject: firewire: add kernel API to access CYCLE_TIME register 1394 OHCI specification defined Isochronous Cycle Timer Register to get value of CYCLE_TIME register defined by IEEE 1394 for CSR architecture defined by ISO/IEC 13213. Unit driver can calculate packet time by compute with the value of CYCLE_TIME and timeStamp field in descriptor of each isochronous and asynchronous context. The resolution of CYCLE_TIME is 49.576 MHz, while the one of timeStamp is 8,000 Hz. Current implementation of Linux FireWire subsystem allows the driver to get the value of CYCLE_TIMER CSR register by transaction service. The transaction service has overhead in regard of access to MMIO register. This commit adds kernel API for unit driver to access the register directly. Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220405072221.226217-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- include/linux/firewire.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 07967a450eaa..2f467c52bdec 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -150,6 +150,8 @@ static inline void fw_card_put(struct fw_card *card) kref_put(&card->kref, fw_card_release); } +int fw_card_read_cycle_time(struct fw_card *card, u32 *cycle_time); + struct fw_attribute_group { struct attribute_group *groups[2]; struct attribute_group group; -- cgit v1.2.3 From b2405aa948b95afc5246fa56fc05c3512cd6185c Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 5 Apr 2022 16:22:21 +0900 Subject: firewire: add kernel API to access packet structure in request structure for AR context In 1394 OHCI specification, descriptor of Asynchronous Receive DMA context has timeStamp field in its trailer quadlet. The field is written by the host controller for the time to receive asynchronous request subaction in isochronous cycle time. In Linux FireWire subsystem, the value of field is stored to fw_packet structure and copied to fw_request structure as the part. The fw_request structure is hidden from unit driver and passed as opaque pointer when calling registered handler. It's inconvenient to the unit driver which needs timestamp of packet. This commit adds kernel API to pick up timestamp from opaque pointer to fw_request structure. Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20220405072221.226217-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- include/linux/firewire.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 2f467c52bdec..980019053e54 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -354,6 +354,7 @@ void fw_core_remove_address_handler(struct fw_address_handler *handler); void fw_send_response(struct fw_card *card, struct fw_request *request, int rcode); int fw_get_request_speed(struct fw_request *request); +u32 fw_request_get_timestamp(const struct fw_request *request); void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode, int destination_id, int generation, int speed, unsigned long long offset, void *payload, size_t length, -- cgit v1.2.3 From a8e2512efc65892a1cbf608d9c03c8bcbe5a623a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Apr 2022 15:06:04 +0100 Subject: PM: core: Add NS varients of EXPORT[_GPL]_SIMPLE_DEV_PM_OPS and runtime pm equiv As more drivers start to use namespaces, we need to have varients of these useful macros that allow the export to be in a particular namespace. Signed-off-by: Jonathan Cameron Cc: Paul Cercueil Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 14 +++++++++----- include/linux/pm_runtime.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..ffe941958501 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -368,13 +368,13 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec) \ + runtime_resume_fn, idle_fn, sec, ns) \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ runtime_resume_fn, idle_fn); \ - _EXPORT_SYMBOL(name, sec) + __EXPORT_SYMBOL(name, sec, ns) #else #define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec) \ + runtime_resume_fn, idle_fn, sec, ns) \ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ resume_fn, runtime_suspend_fn, \ runtime_resume_fn, idle_fn) @@ -391,9 +391,13 @@ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL) #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "") + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "") #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl") + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "") +#define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns) +#define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns) /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. */ #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2bff6a10095d..9e4d056967c6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -41,10 +41,16 @@ #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ - suspend_fn, resume_fn, idle_fn, "") + suspend_fn, resume_fn, idle_fn, "", "") #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ - suspend_fn, resume_fn, idle_fn, "_gpl") + suspend_fn, resume_fn, idle_fn, "_gpl", "") +#define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + suspend_fn, resume_fn, idle_fn, "", #ns) +#define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + suspend_fn, resume_fn, idle_fn, "_gpl", #ns) #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; -- cgit v1.2.3 From 0b5c21bbc01e92745ca1ca4f6fd87d878fa3ea5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Apr 2022 11:38:47 +0200 Subject: net: ensure net_todo_list is processed quickly In [1], Will raised a potential issue that the cfg80211 code, which does (from a locking perspective) rtnl_lock() wiphy_lock() rtnl_unlock() might be suspectible to ABBA deadlocks, because rtnl_unlock() calls netdev_run_todo(), which might end up calling rtnl_lock() again, which could then deadlock (see the comment in the code added here for the scenario). Some back and forth and thinking ensued, but clearly this can't happen if the net_todo_list is empty at the rtnl_unlock() here. Clearly, the code here cannot actually put an entry on it, and all other users of rtnl_unlock() will empty it since that will always go through netdev_run_todo(), emptying the list. So the only other way to get there would be to add to the list and then unlock the RTNL without going through rtnl_unlock(), which is only possible through __rtnl_unlock(). However, this isn't exported and not used in many places, and none of them seem to be able to unregister before using it. Therefore, add a WARN_ON() in the code to ensure this invariant won't be broken, so that the cfg80211 (or any similar) code stays safe. [1] https://lore.kernel.org/r/Yjzpo3TfZxtKPMAG@google.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220404113847.0ee02e4a70da.Ic73d206e217db20fd22dcec14fe5442ca732804b@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b6a1e7f643da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3894,7 +3894,8 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; extern unsigned int netdev_budget_usecs; -/* Called by rtnetlink.c:rtnl_unlock() */ +/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ +extern struct list_head net_todo_list; void netdev_run_todo(void); static inline void __dev_put(struct net_device *dev) -- cgit v1.2.3 From 40379a0084c2f65eb62c102f5bbf5cdc14a50410 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 Apr 2022 15:08:15 +0300 Subject: net/mlx5_fpga: Drop INNOVA TLS support Mellanox INNOVA TLS cards are EOL in May, 2018 [1]. As such, the code is unmaintained, untested and not in-use by any upstream/distro oriented customers. In order to reduce code complexity, drop the kernel code. [1] https://network.nvidia.com/related-docs/eol/LCR-000286.pdf Link: https://lore.kernel.org/r/b88add368def721ea9d054cb69def72d9e3f67aa.1649073691.git.leonro@nvidia.com Reviewed-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc_fpga.h | 63 -------------------------------------- 1 file changed, 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 07d77323f78a..e3d824f6a309 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -54,7 +54,6 @@ enum { enum { MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, - MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS = 0x3, }; struct mlx5_ifc_fpga_shell_caps_bits { @@ -387,27 +386,6 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_tls_extended_cap_bits { - u8 aes_gcm_128[0x1]; - u8 aes_gcm_256[0x1]; - u8 reserved_at_2[0x1e]; - u8 reserved_at_20[0x20]; - u8 context_capacity_total[0x20]; - u8 context_capacity_rx[0x20]; - u8 context_capacity_tx[0x20]; - u8 reserved_at_a0[0x10]; - u8 tls_counter_size[0x10]; - u8 tls_counters_addr_low[0x20]; - u8 tls_counters_addr_high[0x20]; - u8 rx[0x1]; - u8 tx[0x1]; - u8 tls_v12[0x1]; - u8 tls_v13[0x1]; - u8 lro[0x1]; - u8 ipv6[0x1]; - u8 reserved_at_106[0x1a]; -}; - struct mlx5_ifc_ipsec_extended_cap_bits { u8 encapsulation[0x20]; @@ -572,45 +550,4 @@ struct mlx5_ifc_fpga_ipsec_sa { __be16 vid; /* only 12 bits, rest is reserved */ __be16 reserved2; } __packed; - -enum fpga_tls_cmds { - CMD_SETUP_STREAM = 0x1001, - CMD_TEARDOWN_STREAM = 0x1002, - CMD_RESYNC_RX = 0x1003, -}; - -#define MLX5_TLS_1_2 (0) - -#define MLX5_TLS_ALG_AES_GCM_128 (0) -#define MLX5_TLS_ALG_AES_GCM_256 (1) - -struct mlx5_ifc_tls_cmd_bits { - u8 command_type[0x20]; - u8 ipv6[0x1]; - u8 direction_sx[0x1]; - u8 tls_version[0x2]; - u8 reserved[0x1c]; - u8 swid[0x20]; - u8 src_port[0x10]; - u8 dst_port[0x10]; - union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; - union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; - u8 tls_rcd_sn[0x40]; - u8 tcp_sn[0x20]; - u8 tls_implicit_iv[0x20]; - u8 tls_xor_iv[0x40]; - u8 encryption_key[0x100]; - u8 alg[4]; - u8 reserved2[0x1c]; - u8 reserved3[0x4a0]; -}; - -struct mlx5_ifc_tls_resp_bits { - u8 syndrome[0x20]; - u8 stream_id[0x20]; - u8 reserved[0x40]; -}; - -#define MLX5_TLS_COMMAND_SIZE (0x100) - #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 0276bd3a94c072de3f69b5afe6224e488cc76635 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Apr 2022 17:15:16 +0200 Subject: IB/mlx5: Fix undefined behavior due to shift overflowing the constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: drivers/infiniband/hw/mlx5/main.c: In function ‘translate_eth_legacy_proto_oper’: drivers/infiniband/hw/mlx5/main.c:370:2: error: case label does not reduce to an integer constant case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): ^~~~ See https://lore.kernel.org/r/YkwQ6%2BtIH8GQpuct@zn.tnic for the gory details as to why it triggers with older gccs only. Link: https://lore.kernel.org/all/20220405151517.29753-11-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Leon Romanovsky Cc: Saeed Mahameed Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 28a928b0684b..e96ee1e348cb 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -141,7 +141,7 @@ enum mlx5_ptys_width { MLX5_PTYS_WIDTH_12X = 1 << 4, }; -#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) +#define MLX5E_PROT_MASK(link_mode) (1U << link_mode) #define MLX5_GET_ETH_PROTO(reg, out, ext, field) \ (ext ? MLX5_GET(reg, out, ext_##field) : \ MLX5_GET(reg, out, field)) -- cgit v1.2.3 From a285909f471d6703a04b2b3942c352e27131c92b Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 6 Apr 2022 15:00:03 +0900 Subject: mm/slub, kunit: Make slub_kunit unaffected by user specified flags slub_kunit does not expect other debugging flags to be set when running tests. When SLAB_RED_ZONE flag is set globally, test fails because the flag affects number of errors reported. To make slub_kunit unaffected by user specified debugging flags, introduce SLAB_NO_USER_FLAGS to ignore them. With this flag, only flags specified in the code are used and others are ignored. Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/Yk0sY9yoJhFEXWOg@hyeyoo --- include/linux/slab.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 373b3ef99f4e..11ceddcae9f4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -112,6 +112,13 @@ #define SLAB_KASAN 0 #endif +/* + * Ignore user specified debugging flags. + * Intended for caches created for self-tests so they have only flags + * specified in the code and other flags are ignored. + */ +#define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U) + /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) -- cgit v1.2.3 From a5f1783be29adae15666fd803efd7d2979130869 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 2 Mar 2022 12:02:22 +0100 Subject: lib/stackdepot: allow requesting early initialization dynamically In a later patch we want to add stackdepot support for object owner tracking in slub caches, which is enabled by slub_debug boot parameter. This creates a bootstrap problem as some caches are created early in boot when slab_is_available() is false and thus stack_depot_init() tries to use memblock. But, as reported by Hyeonggon Yoo [1] we are already beyond memblock_free_all(). Ideally memblock allocation should fail, yet it succeeds, but later the system crashes, which is a separately handled issue. To resolve this boostrap issue in a robust way, this patch adds another way to request stack_depot_early_init(), which happens at a well-defined point of time. In addition to build-time CONFIG_STACKDEPOT_ALWAYS_INIT, code that's e.g. processing boot parameters (which happens early enough) can call a new function stack_depot_want_early_init(), which sets a flag that stack_depot_early_init() will check. In this patch we also convert page_owner to this approach. While it doesn't have the bootstrap issue as slub, it's also a functionality enabled by a boot param and can thus request stack_depot_early_init() with memblock allocation instead of later initialization with kvmalloc(). As suggested by Mike, make stack_depot_early_init() only attempt memblock allocation and stack_depot_init() only attempt kvmalloc(). Also change the latter to kvcalloc(). In both cases we can lose the explicit array zeroing, which the allocations do already. As suggested by Marco, provide empty implementations of the init functions for !CONFIG_STACKDEPOT builds to simplify the callers. [1] https://lore.kernel.org/all/YhnUcqyeMgCrWZbd@ip-172-31-19-208.ap-northeast-1.compute.internal/ Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Suggested-by: Mike Rapoport Suggested-by: Marco Elver Signed-off-by: Vlastimil Babka Reviewed-by: Marco Elver Reviewed-and-tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport Acked-by: David Rientjes --- include/linux/stackdepot.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 17f992fe6355..bc2797955de9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -20,18 +20,36 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, gfp_t gfp_flags, bool can_alloc); /* - * Every user of stack depot has to call this during its own init when it's - * decided that it will be calling stack_depot_save() later. + * Every user of stack depot has to call stack_depot_init() during its own init + * when it's decided that it will be calling stack_depot_save() later. This is + * recommended for e.g. modules initialized later in the boot process, when + * slab_is_available() is true. * * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot * enabled as part of mm_init(), for subsystems where it's known at compile time * that stack depot will be used. + * + * Another alternative is to call stack_depot_want_early_init(), when the + * decision to use stack depot is taken e.g. when evaluating kernel boot + * parameters, which precedes the enablement point in mm_init(). + * + * stack_depot_init() and stack_depot_want_early_init() can be called regardless + * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print + * functions should only be called from code that makes sure CONFIG_STACKDEPOT + * is enabled. */ +#ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT -static inline int stack_depot_early_init(void) { return stack_depot_init(); } +void __init stack_depot_want_early_init(void); + +/* This is supposed to be called only from mm_init() */ +int __init stack_depot_early_init(void); #else +static inline int stack_depot_init(void) { return 0; } + +static inline void stack_depot_want_early_init(void) { } + static inline int stack_depot_early_init(void) { return 0; } #endif -- cgit v1.2.3 From f742b90e61bb53b27771f64bdae05db03a6ab1f2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:55:49 -0600 Subject: x86/mm: Extend cc_attr to include AMD SEV-SNP The CC_ATTR_GUEST_SEV_SNP can be used by the guest to query whether the SNP (Secure Nested Paging) feature is active. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-10-brijesh.singh@amd.com --- include/linux/cc_platform.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index efd8205282da..d08dd65b5c43 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -72,6 +72,14 @@ enum cc_attr { * Examples include TDX guest & SEV. */ CC_ATTR_GUEST_UNROLL_STRING_IO, + + /** + * @CC_ATTR_SEV_SNP: Guest SNP is active. + * + * The platform/OS is running as a guest/virtual machine and actively + * using AMD SEV-SNP features. + */ + CC_ATTR_GUEST_SEV_SNP, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3 From f4b41f062c424209e3939a81e6da022e049a45f2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 4 Apr 2022 18:30:22 +0200 Subject: net: remove noblock parameter from skb_recv_datagram() skb_recv_datagram() has two parameters 'flags' and 'noblock' that are merged inside skb_recv_datagram() by 'flags | (noblock ? MSG_DONTWAIT : 0)' As 'flags' may contain MSG_DONTWAIT as value most callers split the 'flags' into 'flags' and 'noblock' with finally obsolete bit operations like this: skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); And this is not even done consistently with the 'flags' parameter. This patch removes the obsolete and costly splitting into two parameters and only performs bit operations when really needed on the caller side. One missing conversion thankfully reported by kernel test robot. I missed to enable kunit tests to build the mctp code. Reported-by: kernel test robot Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a30cae8b0a5..2394441fa3dd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3836,8 +3836,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); -struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, - int *err); +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, -- cgit v1.2.3 From fe696ccb277d332dc4e625b5b20b988b04d16c04 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 3 Apr 2022 15:53:54 -0700 Subject: gpu: host1x: Fix a kernel-doc warning Add @cache description to eliminate a kernel-doc warning. include/linux/host1x.h:104: warning: Function parameter or member 'cache' not described in 'host1x_client' Fixes: 1f39b1dfa53c ("drm/tegra: Implement buffer object cache") Signed-off-by: Randy Dunlap Cc: Thierry Reding Cc: linux-tegra@vger.kernel.org Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Signed-off-by: Thierry Reding --- include/linux/host1x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index e8dc5bc41f79..00278853eadf 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -81,6 +81,7 @@ struct host1x_client_ops { * @parent: pointer to parent structure * @usecount: reference count for this structure * @lock: mutex for mutually exclusive concurrency + * @cache: host1x buffer object cache */ struct host1x_client { struct list_head list; -- cgit v1.2.3 From 804775dfc2885e93a0a4b35db1914c2cc25172b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:47 +0200 Subject: net: ethernet: mtk_eth_soc: add support for Wireless Ethernet Dispatch (WED) The Wireless Ethernet Dispatch subsystem on the MT7622 SoC can be configured to intercept and handle access to the DMA queues and PCIe interrupts for a MT7615/MT7915 wireless card. It can manage the internal WDMA (Wireless DMA) controller, which allows ethernet packets to be passed from the packet switch engine (PSE) to the wireless card, bypassing the CPU entirely. This can be used to implement hardware flow offloading from ethernet to WLAN. Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 131 +++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 include/linux/soc/mediatek/mtk_wed.h (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h new file mode 100644 index 000000000000..7e00cca06709 --- /dev/null +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -0,0 +1,131 @@ +#ifndef __MTK_WED_H +#define __MTK_WED_H + +#include +#include +#include +#include + +#define MTK_WED_TX_QUEUES 2 + +struct mtk_wed_hw; +struct mtk_wdma_desc; + +struct mtk_wed_ring { + struct mtk_wdma_desc *desc; + dma_addr_t desc_phys; + int size; + + u32 reg_base; + void __iomem *wpdma; +}; + +struct mtk_wed_device { +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + const struct mtk_wed_ops *ops; + struct device *dev; + struct mtk_wed_hw *hw; + bool init_done, running; + int wdma_idx; + int irq; + + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + + struct { + int size; + void **pages; + struct mtk_wdma_desc *desc; + dma_addr_t desc_phys; + } buf_ring; + + /* filled by driver: */ + struct { + struct pci_dev *pci_dev; + + u32 wpdma_phys; + + u16 token_start; + unsigned int nbuf; + + u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); + int (*offload_enable)(struct mtk_wed_device *wed); + void (*offload_disable)(struct mtk_wed_device *wed); + } wlan; +#endif +}; + +struct mtk_wed_ops { + int (*attach)(struct mtk_wed_device *dev); + int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); + int (*txfree_ring_setup)(struct mtk_wed_device *dev, + void __iomem *regs); + void (*detach)(struct mtk_wed_device *dev); + + void (*stop)(struct mtk_wed_device *dev); + void (*start)(struct mtk_wed_device *dev, u32 irq_mask); + void (*reset_dma)(struct mtk_wed_device *dev); + + u32 (*reg_read)(struct mtk_wed_device *dev, u32 reg); + void (*reg_write)(struct mtk_wed_device *dev, u32 reg, u32 val); + + u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask); + void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask); +}; + +extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops; + +static inline int +mtk_wed_device_attach(struct mtk_wed_device *dev) +{ + int ret = -ENODEV; + +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + rcu_read_lock(); + dev->ops = rcu_dereference(mtk_soc_wed_ops); + if (dev->ops) + ret = dev->ops->attach(dev); + else + rcu_read_unlock(); + + if (ret) + dev->ops = NULL; +#endif + + return ret; +} + +#ifdef CONFIG_NET_MEDIATEK_SOC_WED +#define mtk_wed_device_active(_dev) !!(_dev)->ops +#define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) +#define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ + (_dev)->ops->txfree_ring_setup(_dev, _regs) +#define mtk_wed_device_reg_read(_dev, _reg) \ + (_dev)->ops->reg_read(_dev, _reg) +#define mtk_wed_device_reg_write(_dev, _reg, _val) \ + (_dev)->ops->reg_write(_dev, _reg, _val) +#define mtk_wed_device_irq_get(_dev, _mask) \ + (_dev)->ops->irq_get(_dev, _mask) +#define mtk_wed_device_irq_set_mask(_dev, _mask) \ + (_dev)->ops->irq_set_mask(_dev, _mask) +#else +static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) +{ + return false; +} +#define mtk_wed_device_detach(_dev) do {} while (0) +#define mtk_wed_device_start(_dev, _mask) do {} while (0) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_reg_read(_dev, _reg) 0 +#define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) +#define mtk_wed_device_irq_get(_dev, _mask) 0 +#define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#endif + +#endif -- cgit v1.2.3 From a333215e10cb5d3b1e0685ca117f0e9452215485 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:48 +0200 Subject: net: ethernet: mtk_eth_soc: implement flow offloading to WED devices This allows hardware flow offloading from Ethernet to WLAN on MT7622 SoC Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6a1e7f643da..7b2a0b739684 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -862,6 +862,7 @@ enum net_device_path_type { DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, + DEV_PATH_MTK_WDMA, }; struct net_device_path { @@ -887,6 +888,12 @@ struct net_device_path { int port; u16 proto; } dsa; + struct { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; + } mtk_wdma; }; }; -- cgit v1.2.3 From 3e9c4584336149146fe15cb5703fc10a2ca2d2a0 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 24 Mar 2022 11:30:25 +0100 Subject: gpu: host1x: Do not use mapping cache for job submissions Buffer mappings used in job submissions are usually small and not rapidly reused as opposed to framebuffers (which are usually large and rapidly reused, for example when page-flipping between double-buffered framebuffers). Avoid going through the mapping cache for these buffers since the cache would also lead to leaks if nobody is ever releasing the cache's last reference. For DRM/KMS these last references are dropped when the framebuffers are removed and therefore no longer needed. While at it, also add a note about the need to explicitly remove the final reference to the mapping in the cache. Reviewed-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Thierry Reding --- include/linux/host1x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 00278853eadf..c0bf4e581fe9 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -31,6 +31,11 @@ u64 host1x_get_dma_mask(struct host1x *host1x); * struct host1x_bo_cache - host1x buffer object cache * @mappings: list of mappings * @lock: synchronizes accesses to the list of mappings + * + * Note that entries are not periodically evicted from this cache and instead need to be + * explicitly released. This is used primarily for DRM/KMS where the cache's reference is + * released when the last reference to a buffer object represented by a mapping in this + * cache is dropped. */ struct host1x_bo_cache { struct list_head mappings; -- cgit v1.2.3 From a60707d74bd1d119cf7bcc5101cda912fc46d5e3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:57 +0800 Subject: sched: Move child_runs_first sysctls to fair.c move child_runs_first sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c1076b5e17fb..1d2ff3cd1728 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,8 +14,6 @@ extern unsigned long sysctl_hung_task_timeout_secs; enum { sysctl_hung_task_timeout_secs = 0 }; #endif -extern unsigned int sysctl_sched_child_runs_first; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, -- cgit v1.2.3 From f5ef06d58be8311a9425e6a54a053ecb350952f3 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:58 +0800 Subject: sched: Move schedstats sysctls to core.c move schedstats sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1d2ff3cd1728..6c7a6850559b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -64,8 +64,6 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -- cgit v1.2.3 From d9ab0e63fa7f8405fbb19e28c5191e0880a7f2db Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:45:59 +0800 Subject: sched: Move rt_period/runtime sysctls to rt.c move rt_period/runtime sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 6c7a6850559b..4391c1307945 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,15 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -/* - * control realtime throttling: - * - * /proc/sys/kernel/sched_rt_period_us - * /proc/sys/kernel/sched_rt_runtime_us - */ -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - extern unsigned int sysctl_sched_dl_period_max; extern unsigned int sysctl_sched_dl_period_min; @@ -58,8 +49,6 @@ extern int sched_rr_timeslice; int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, -- cgit v1.2.3 From 84227c12888b1105725cd2de14705b029bcbb4b2 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:00 +0800 Subject: sched: Move deadline_period sysctls to deadline.c move deadline_period sysctls to deadline.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4391c1307945..7da9b94c5e1c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,9 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -extern unsigned int sysctl_sched_dl_period_max; -extern unsigned int sysctl_sched_dl_period_min; - #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; -- cgit v1.2.3 From dafd7a9dad22fadcb290b24dff54e2eae3b89776 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:01 +0800 Subject: sched: Move rr_timeslice sysctls to rt.c move rr_timeslice sysctls to rt.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7da9b94c5e1c..3d307e512d1f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,11 +41,6 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -extern int sysctl_sched_rr_timeslice; -extern int sched_rr_timeslice; - -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, -- cgit v1.2.3 From 3267e0156c3341ac25b37a0f60551cdae1634b60 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:02 +0800 Subject: sched: Move uclamp_util sysctls to core.c move uclamp_util sysctls to core.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3d307e512d1f..0934b21a57a4 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,18 +31,10 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_UCLAMP_TASK -extern unsigned int sysctl_sched_uclamp_util_min; -extern unsigned int sysctl_sched_uclamp_util_max; -extern unsigned int sysctl_sched_uclamp_util_min_rt_default; -#endif - #ifdef CONFIG_CFS_BANDWIDTH extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From d4ae80ffa64f87b9c355692b680b603add084e96 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:03 +0800 Subject: sched: Move cfs_bandwidth_slice sysctls to fair.c move cfs_bandwidth_slice sysctls to fair.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 0934b21a57a4..198f77c8a873 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,10 +31,6 @@ extern int sysctl_numa_balancing_mode; #define sysctl_numa_balancing_mode 0 #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From 8a0441415b3f9b9a920a6a5086580ea3daa7b884 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Tue, 15 Feb 2022 19:46:04 +0800 Subject: sched: Move energy_aware sysctls to topology.c move energy_aware sysctls to topology.c and use the new register_sysctl_init() to register the sysctl interface. Signed-off-by: Zhen Ni Signed-off-by: Luis Chamberlain --- include/linux/sched/sysctl.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 198f77c8a873..e650946816d0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -34,10 +34,4 @@ extern int sysctl_numa_balancing_mode; int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -extern unsigned int sysctl_sched_energy_aware; -int sched_energy_aware_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #endif /* _LINUX_SCHED_SYSCTL_H */ -- cgit v1.2.3 From 06d177662fb86b80c7fc2290667b9a14cb0bd925 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Thu, 17 Feb 2022 12:23:21 +0800 Subject: kernel/reboot: move reboot sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the poweroff_cmd and ctrl-alt-del sysctls to its own file, kernel/reboot.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/reboot.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index af907a3d68d1..a2429648d831 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -71,12 +71,8 @@ extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); -extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); -#define POWEROFF_CMD_PATH_LEN 256 -extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; - extern void orderly_poweroff(bool force); extern void orderly_reboot(void); void hw_protection_shutdown(const char *reason, int ms_until_forced); -- cgit v1.2.3 From 43fe219aa56a2fdd8f0623c9470a32b14b0617a5 Mon Sep 17 00:00:00 2001 From: sujiaxun Date: Thu, 17 Feb 2022 18:51:48 -0800 Subject: mm: move oom_kill sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the oom_kill sysctls to their own file, mm/oom_kill.c [sfr@canb.auug.org.au: null-terminate the array] Link: https://lkml.kernel.org/r/20220216193202.28838626@canb.auug.org.au Link: https://lkml.kernel.org/r/20220215093203.31032-1-sujiaxun@uniontech.com Signed-off-by: sujiaxun Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/oom.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 2db9a1432511..02d1e7bbd8cd 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -123,8 +123,4 @@ extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); -/* sysctls */ -extern int sysctl_oom_dump_tasks; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_panic_on_oom; #endif /* _INCLUDE_LINUX_OOM_H */ -- cgit v1.2.3 From aa779e5102195e1d9ade95dcbc0bfbd8f916eb59 Mon Sep 17 00:00:00 2001 From: zhanglianjie Date: Thu, 17 Feb 2022 18:51:51 -0800 Subject: mm: move page-writeback sysctls to their own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the page-writeback sysctls to its own file. [akpm@linux-foundation.org: coding-style cleanups] akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warnings] Link: https://lkml.kernel.org/r/20220129012955.26594-1-zhanglianjie@uniontech.com Signed-off-by: zhanglianjie Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Luis Chamberlain --- include/linux/writeback.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec..dc2b94e6a94f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom); extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ -extern int dirty_background_ratio; -extern unsigned long dirty_background_bytes; -extern int vm_dirty_ratio; -extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; -extern int vm_highmem_is_dirtyable; extern int laptop_mode; -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -- cgit v1.2.3 From f79c9b8ae8bde10126586c1bb55b5fd027276d8e Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:58:57 +0800 Subject: kernel/lockdep: move lockdep sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the prove_locking and lock_stat sysctls to its own file, kernel/lockdep.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/lockdep.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..37951c17908e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -16,10 +16,6 @@ struct task_struct; -/* for sysctl */ -extern int prove_locking; -extern int lock_stat; - #ifdef CONFIG_LOCKDEP #include -- cgit v1.2.3 From 9df918698408fd914493aba0b7858fef50eba63a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:12 +0800 Subject: kernel/panic: move panic sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the oops_all_cpu_backtrace sysctl to its own file, kernel/panic.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/panic.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index f5844908a089..e71161da69c4 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -15,12 +15,6 @@ extern void oops_enter(void); extern void oops_exit(void); extern bool oops_may_print(void); -#ifdef CONFIG_SMP -extern unsigned int sysctl_oops_all_cpu_backtrace; -#else -#define sysctl_oops_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; -- cgit v1.2.3 From 801b501439d1b366d524dee4fc1e6b3473a95b9a Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:23 +0800 Subject: kernel/acct: move acct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the acct sysctl to its own file, kernel/acct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/acct.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acct.h b/include/linux/acct.h index bc70e81895c0..2718c4854815 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -21,7 +21,6 @@ #ifdef CONFIG_BSD_PROCESS_ACCT struct pid_namespace; -extern int acct_parm[]; /* for sysctl */ extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); -- cgit v1.2.3 From 1186618a6a35d43a865448472a261184b608d13c Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:36 +0800 Subject: kernel/delayacct: move delayacct sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the delayacct sysctl to its own file, kernel/delayacct.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/delayacct.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 3e03d010bd2e..6b16a6930a19 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -61,9 +61,6 @@ extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); -extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); -- cgit v1.2.3 From d772cc2c321900f3f463a124eebeb7218e66dda6 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Fri, 18 Feb 2022 18:59:49 +0800 Subject: kernel/do_mount_initrd: move real_root_dev sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. All filesystem syctls now get reviewed by fs folks. This commit follows the commit of fs, move the real_root_dev sysctl to its own file, kernel/do_mount_initrd.c. Signed-off-by: tangmeng Signed-off-by: Luis Chamberlain --- include/linux/initrd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 1bbe9af48dc3..f1a1f4c92ded 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -29,8 +29,6 @@ static inline void wait_for_initramfs(void) {} extern phys_addr_t phys_initrd_start; extern unsigned long phys_initrd_size; -extern unsigned int real_root_dev; - extern char __initramfs_start[]; extern unsigned long __initramfs_size; -- cgit v1.2.3 From 8e4e83b2278bdfb55cb2b13de07cf0a721ce8af7 Mon Sep 17 00:00:00 2001 From: Wei Xiao Date: Wed, 23 Feb 2022 19:11:53 +0800 Subject: ftrace: move sysctl_ftrace_enabled to ftrace.c This moves ftrace_enabled to trace/ftrace.c. We move sysctls to places where features actually belong to improve the readability of the code and reduce the risk of code merge conflicts. At the same time, the proc-sysctl maintainers do not want to know what sysctl knobs you wish to add for your owner piece of code, we just care about the core logic. Signed-off-by: Wei Xiao Acked-by: Steven Rostedt (Google) Signed-off-by: Luis Chamberlain --- include/linux/ftrace.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..088b915853dd 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -101,9 +101,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; -extern int -ftrace_enable_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS -- cgit v1.2.3 From 5ea98e01ab524cbc53dad8aebd27b434ebe5d074 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:39 -0600 Subject: x86/boot: Add Confidential Computing type to setup_data While launching encrypted guests, the hypervisor may need to provide some additional information during the guest boot. When booting under an EFI-based BIOS, the EFI configuration table contains an entry for the confidential computing blob that contains the required information. To support booting encrypted guests on non-EFI VMs, the hypervisor needs to pass this additional information to the guest kernel using a different method. For this purpose, introduce SETUP_CC_BLOB type in setup_data to hold the physical address of the confidential computing blob location. The boot loader or hypervisor may choose to use this method instead of an EFI configuration table. The CC blob location scanning should give preference to a setup_data blob over an EFI configuration table. In AMD SEV-SNP, the CC blob contains the address of the secrets and CPUID pages. The secrets page includes information such as a VM to PSP communication key and the CPUID page contains PSP-filtered CPUID values. Define the AMD SEV confidential computing blob structure. While at it, define the EFI GUID for the confidential computing blob. [ bp: Massage commit message, mark struct __packed. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220307213356.2797205-30-brijesh.singh@amd.com --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index ccd4d3f91c98..984aa688997a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -390,6 +390,7 @@ void efi_native_runtime_setup(void); #define EFI_CERT_SHA256_GUID EFI_GUID(0xc1c41626, 0x504c, 0x4092, 0xac, 0xa9, 0x41, 0xf9, 0x36, 0x93, 0x43, 0x28) #define EFI_CERT_X509_GUID EFI_GUID(0xa5c059a1, 0x94e4, 0x4aa7, 0x87, 0xb5, 0xab, 0x15, 0x5c, 0x2b, 0xf0, 0x72) #define EFI_CERT_X509_SHA256_GUID EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) /* * This GUID is used to pass to the kernel proper the struct screen_info -- cgit v1.2.3 From bae1a962ac2c5e6be08319ff3f7d6df542584fce Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:33 +0300 Subject: x86/topology: Disable CPU online/offline control for TDX guests Unlike regular VMs, TDX guests use the firmware hand-off wakeup method to wake up the APs during the boot process. This wakeup model uses a mailbox to communicate with firmware to bring up the APs. As per the design, this mailbox can only be used once for the given AP, which means after the APs are booted, the same mailbox cannot be used to offline/online the given AP. More details about this requirement can be found in Intel TDX Virtual Firmware Design Guide, sec titled "AP initialization in OS" and in sec titled "Hotplug Device". Since the architecture does not support any method of offlining the CPUs, disable CPU hotplug support in the kernel. Since this hotplug disable feature can be re-used by other VM guests, add a new CC attribute CC_ATTR_HOTPLUG_DISABLED and use it to disable the hotplug support. Attempt to offline CPU will fail with -EOPNOTSUPP. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-25-kirill.shutemov@linux.intel.com --- include/linux/cc_platform.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index efd8205282da..691494bbaf5a 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -72,6 +72,16 @@ enum cc_attr { * Examples include TDX guest & SEV. */ CC_ATTR_GUEST_UNROLL_STRING_IO, + + /** + * @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled. + * + * The platform/OS is running as a guest/virtual machine does not + * support CPU hotplug feature. + * + * Examples include TDX Guest. + */ + CC_ATTR_HOTPLUG_DISABLED, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3 From 6264f58ca0e54e41d63c2d00334a48bac28fbf30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:54 -0700 Subject: net: extract a few internals from netdevice.h There's a number of functions and static variables used under net/core/ but not from the outside. We currently dump most of them into netdevice.h. That bad for many reasons: - netdevice.h is very cluttered, hard to figure out what the APIs are; - netdevice.h is very long; - we have to touch netdevice.h more which causes expensive incremental builds. Create a header under net/core/ and move some declarations. The new header is also a bit of a catch-all but that's fine, if we create more specific headers people will likely over-think where their declaration fit best. And end up putting them in netdevice.h, again. More work should be done on splitting netdevice.h into more targeted headers, but that'd be more time consuming so small steps. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 72 ++--------------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b2a0b739684..7e7b2a72e473 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,8 @@ struct dsa_port; struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; - +struct netdev_name_node; +struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; @@ -1020,16 +1021,6 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; -struct netdev_name_node { - struct hlist_node hlist; - struct list_head list; - struct net_device *dev; - const char *name; -}; - -int netdev_name_node_alt_create(struct net_device *dev, const char *name); -int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); - struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; @@ -2975,7 +2966,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); @@ -3034,19 +3024,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -#ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ -struct sd_flow_limit { - u64 count; - unsigned int num_buckets; - unsigned int history_head; - u16 history[FLOW_LIMIT_HISTORY]; - u8 buckets[]; -}; - -extern int netdev_flow_limit_table_len; -#endif /* CONFIG_NET_FLOW_LIMIT */ - /* * Incoming packets are placed on per-CPU queues */ @@ -3770,7 +3747,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); -int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, @@ -3782,13 +3758,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, return __dev_change_net_namespace(dev, net, pat, 0); } int __dev_set_mtu(struct net_device *, int); -int dev_validate_mtu(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); -int dev_change_tx_queue_len(struct net_device *, unsigned long); -void dev_set_group(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, @@ -3796,24 +3766,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); -int dev_change_carrier(struct net_device *, bool new_carrier); -int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_item_id *ppid); -int dev_get_phys_port_name(struct net_device *dev, - char *name, size_t len); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); @@ -3898,13 +3857,6 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; - -/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ -extern struct list_head net_todo_list; -void netdev_run_todo(void); - static inline void __dev_put(struct net_device *dev) { if (dev) { @@ -4021,10 +3973,7 @@ static inline void dev_replace_track(struct net_device *odev, * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. */ - -void linkwatch_init_dev(struct net_device *dev); void linkwatch_fire_event(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4470,9 +4419,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -void dev_addr_flush(struct net_device *dev); -int dev_addr_init(struct net_device *dev); -void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); @@ -4562,7 +4508,6 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); -void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); @@ -4580,11 +4525,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; -extern int netdev_tstamp_prequeue; -extern int netdev_unregister_timeout_secs; -extern int weight_p; -extern int dev_weight_rx_bias; -extern int dev_weight_tx_bias; extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; @@ -4772,12 +4712,6 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); -#ifdef CONFIG_PROC_FS -int __init dev_proc_init(void); -#else -#define dev_proc_init() 0 -#endif - static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4813,8 +4747,6 @@ extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -void linkwatch_run_queue(void); - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { -- cgit v1.2.3 From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment rx_otherhost_dropped counter when packet dropped due to mismatched dest MAC addr. An example when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes local traffic was generated using trafgen for the client and netcat to start a server Tested: Created 2 netns, sent 1 packet using trafgen from 1 to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From bdae79651453df0bca20963fc2ab970146ef2a37 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Tue, 8 Mar 2022 22:40:51 +0800 Subject: efi/cper: Add a cper_mem_err_status_str() to decode error description Introduce a new helper function cper_mem_err_status_str() to decode the error status value into a human readable string. [ bp: Massage. ] Signed-off-by: Shuai Xue Signed-off-by: Borislav Petkov Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220308144053.49090-2-xueshuai@linux.alibaba.com --- include/linux/cper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index 6a511a1078ca..5b1dd27b317d 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -558,6 +558,7 @@ extern const char *const cper_proc_error_type_strs[4]; u64 cper_next_record_id(void); const char *cper_severity_str(unsigned int); const char *cper_mem_err_type_str(unsigned int); +const char *cper_mem_err_status_str(u64 status); void cper_print_bits(const char *prefix, unsigned int bits, const char * const strs[], unsigned int strs_size); void cper_mem_err_pack(const struct cper_sec_mem_err *, -- cgit v1.2.3 From ed27b5df3877458eb24615fd9c202178660db009 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Tue, 8 Mar 2022 22:40:52 +0800 Subject: EDAC/ghes: Unify CPER memory error location reporting Switch the GHES EDAC memory error reporting functions to use the common CPER ones and get rid of code duplication. [ bp: - rewrite commit message, remove useless text - rip out useless reformatting - align function params on the opening brace - rename function to a more descriptive name - drop useless function exports - handle buffer lengths properly when printing other detail - remove useless casting ] Signed-off-by: Shuai Xue Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220308144053.49090-3-xueshuai@linux.alibaba.com --- include/linux/cper.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index 5b1dd27b317d..eacb7dd7b3af 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -569,5 +569,7 @@ void cper_print_proc_arm(const char *pfx, const struct cper_sec_proc_arm *proc); void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc); +int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg); +int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg); #endif -- cgit v1.2.3 From 85ebb1a6bd62147ebcfa70500d513331a8daf9e0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Apr 2022 13:35:52 +0300 Subject: gpiolib: Introduce for_each_gpiochip_node() loop helper Introduce for_each_gpiochip_node() loop helper which iterates over the GPIO controller child nodes of a given device. Signed-off-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Acked-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 98c93510640e..bfc91f122d5f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -3,13 +3,14 @@ #define __LINUX_GPIO_DRIVER_H #include -#include #include #include #include #include #include #include +#include +#include struct gpio_desc; struct of_phandle_args; @@ -750,4 +751,8 @@ static inline void gpiochip_unlock_as_irq(struct gpio_chip *gc, } #endif /* CONFIG_GPIOLIB */ +#define for_each_gpiochip_node(dev, child) \ + device_for_each_child_node(dev, child) \ + if (!fwnode_property_present(child, "gpio-controller")) {} else + #endif /* __LINUX_GPIO_DRIVER_H */ -- cgit v1.2.3 From 0b19dde90ad004592792a928c75e80612be3e2e8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Apr 2022 13:35:53 +0300 Subject: gpiolib: Introduce gpiochip_node_count() helper The gpiochip_node_count() helper iterates over the device child nodes that have the "gpio-controller" property set. It returns the number of such nodes under a given device. Signed-off-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Acked-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index bfc91f122d5f..12de0b22b4ef 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -755,4 +755,15 @@ static inline void gpiochip_unlock_as_irq(struct gpio_chip *gc, device_for_each_child_node(dev, child) \ if (!fwnode_property_present(child, "gpio-controller")) {} else +static inline unsigned int gpiochip_node_count(struct device *dev) +{ + struct fwnode_handle *child; + unsigned int count = 0; + + for_each_gpiochip_node(dev, child) + count++; + + return count; +} + #endif /* __LINUX_GPIO_DRIVER_H */ -- cgit v1.2.3 From 2fa33b3518a8da0a5345b7ae0064223b5e4e156f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:36 +0300 Subject: net/mlx5_fpga: Drop INNOVA IPsec support Mellanox INNOVA IPsec cards are EOL in Nov, 2019 [1]. As such, the code is unmaintained, untested and not in-use by any upstream/distro oriented customers. In order to reduce code complexity, drop the kernel code. [1] https://network.nvidia.com/related-docs/eol/LCR-000535.pdf Link: https://lore.kernel.org/r/2afe88ec5020a491079eacf6fe3c89b64d65195c.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc_fpga.h | 148 ------------------------------------- 1 file changed, 148 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index e3d824f6a309..45c7c0d67635 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -386,68 +386,6 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_ipsec_extended_cap_bits { - u8 encapsulation[0x20]; - - u8 reserved_0[0x12]; - u8 v2_command[0x1]; - u8 udp_encap[0x1]; - u8 rx_no_trailer[0x1]; - u8 ipv4_fragment[0x1]; - u8 ipv6[0x1]; - u8 esn[0x1]; - u8 lso[0x1]; - u8 transport_and_tunnel_mode[0x1]; - u8 tunnel_mode[0x1]; - u8 transport_mode[0x1]; - u8 ah_esp[0x1]; - u8 esp[0x1]; - u8 ah[0x1]; - u8 ipv4_options[0x1]; - - u8 auth_alg[0x20]; - - u8 enc_alg[0x20]; - - u8 sa_cap[0x20]; - - u8 reserved_1[0x10]; - u8 number_of_ipsec_counters[0x10]; - - u8 ipsec_counters_addr_low[0x20]; - u8 ipsec_counters_addr_high[0x20]; -}; - -struct mlx5_ifc_ipsec_counters_bits { - u8 dec_in_packets[0x40]; - - u8 dec_out_packets[0x40]; - - u8 dec_bypass_packets[0x40]; - - u8 enc_in_packets[0x40]; - - u8 enc_out_packets[0x40]; - - u8 enc_bypass_packets[0x40]; - - u8 drop_dec_packets[0x40]; - - u8 failed_auth_dec_packets[0x40]; - - u8 drop_enc_packets[0x40]; - - u8 success_add_sa[0x40]; - - u8 fail_add_sa[0x40]; - - u8 success_delete_sa[0x40]; - - u8 fail_delete_sa[0x40]; - - u8 dropped_cmd[0x40]; -}; - enum { MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED = 0x1, MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED = 0x2, @@ -464,90 +402,4 @@ struct mlx5_ifc_fpga_qp_error_event_bits { u8 reserved_at_c0[0x8]; u8 fpga_qpn[0x18]; }; -enum mlx5_ifc_fpga_ipsec_response_syndrome { - MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0, - MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1, - MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE = 2, - MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE = 3, -}; - -struct mlx5_ifc_fpga_ipsec_cmd_resp { - __be32 syndrome; - union { - __be32 sw_sa_handle; - __be32 flags; - }; - u8 reserved[24]; -} __packed; - -enum mlx5_ifc_fpga_ipsec_cmd_opcode { - MLX5_FPGA_IPSEC_CMD_OP_ADD_SA = 0, - MLX5_FPGA_IPSEC_CMD_OP_DEL_SA = 1, - MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 = 2, - MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 = 3, - MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2 = 4, - MLX5_FPGA_IPSEC_CMD_OP_SET_CAP = 5, -}; - -enum mlx5_ifc_fpga_ipsec_cap { - MLX5_FPGA_IPSEC_CAP_NO_TRAILER = BIT(0), -}; - -struct mlx5_ifc_fpga_ipsec_cmd_cap { - __be32 cmd; - __be32 flags; - u8 reserved[24]; -} __packed; - -enum mlx5_ifc_fpga_ipsec_sa_flags { - MLX5_FPGA_IPSEC_SA_ESN_EN = BIT(0), - MLX5_FPGA_IPSEC_SA_ESN_OVERLAP = BIT(1), - MLX5_FPGA_IPSEC_SA_IPV6 = BIT(2), - MLX5_FPGA_IPSEC_SA_DIR_SX = BIT(3), - MLX5_FPGA_IPSEC_SA_SPI_EN = BIT(4), - MLX5_FPGA_IPSEC_SA_SA_VALID = BIT(5), - MLX5_FPGA_IPSEC_SA_IP_ESP = BIT(6), - MLX5_FPGA_IPSEC_SA_IP_AH = BIT(7), -}; - -enum mlx5_ifc_fpga_ipsec_sa_enc_mode { - MLX5_FPGA_IPSEC_SA_ENC_MODE_NONE = 0, - MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128 = 1, - MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128 = 3, -}; - -struct mlx5_ifc_fpga_ipsec_sa_v1 { - __be32 cmd; - u8 key_enc[32]; - u8 key_auth[32]; - __be32 sip[4]; - __be32 dip[4]; - union { - struct { - __be32 reserved; - u8 salt_iv[8]; - __be32 salt; - } __packed gcm; - struct { - u8 salt[16]; - } __packed cbc; - }; - __be32 spi; - __be32 sw_sa_handle; - __be16 tfclen; - u8 enc_mode; - u8 reserved1[2]; - u8 flags; - u8 reserved2[2]; -}; - -struct mlx5_ifc_fpga_ipsec_sa { - struct mlx5_ifc_fpga_ipsec_sa_v1 ipsec_sa_v1; - __be16 udp_sp; - __be16 udp_dp; - u8 reserved1[4]; - __be32 esn; - __be16 vid; /* only 12 bits, rest is reserved */ - __be16 reserved2; -} __packed; #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From de8bdb476908e64805df4bfbad20618cbb1f9ffa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:42 +0300 Subject: RDMA/mlx5: Drop crypto flow steering API The mlx5 flow steering crypto API was intended to be used in FPGA devices, which is not supported for years already. The removal of mlx5 crypto FPGA code together with inability to configure encryption keys makes the low steering API completely unusable. So delete the code, so any ESP flow steering requests will fail with not supported error, as it is happening now anyway as no device support this type of API. Link: https://lore.kernel.org/r/634a5face7734381463d809bfb89850f6998deac.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index dacf69516002..af67d51308cf 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -111,10 +111,6 @@ struct mlx5_accel_esp_xfrm { struct mlx5_accel_esp_xfrm_attrs attrs; }; -enum { - MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA = 1UL << 0, -}; - enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, @@ -132,8 +128,7 @@ u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs, - u32 flags); + const struct mlx5_accel_esp_xfrm_attrs *attrs); void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, const struct mlx5_accel_esp_xfrm_attrs *attrs); @@ -144,8 +139,10 @@ static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { ret static inline struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs, - u32 flags) { return ERR_PTR(-EOPNOTSUPP); } + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + return ERR_PTR(-EOPNOTSUPP); +} static inline void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} static inline int -- cgit v1.2.3 From 2451da081a343e079d9f5a7b063fcdf0bc439aa8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:46 +0300 Subject: net/mlx5: Unify device IPsec capabilities check Merge two different function to one in order to provide coherent picture if the device is IPsec capable or not. Link: https://lore.kernel.org/r/8f10ea06ad19c6f651e9fb33921009658f01e1d5.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index af67d51308cf..9145e2d37c0e 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -124,7 +124,7 @@ enum mlx5_accel_ipsec_cap { #ifdef CONFIG_MLX5_ACCEL -u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); +u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, @@ -135,7 +135,10 @@ int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, #else -static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } +static inline u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + return 0; +} static inline struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From 54deb0e77561973f4ca4515e18ab972c281eea1d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:48 +0300 Subject: net/mlx5: Remove not-needed IPsec config In current code, the CONFIG_MLX5_IPSEC and CONFIG_MLX5_EN_IPSEC are the same. So remove useless indirection. Link: https://lore.kernel.org/r/fd14492cbc01a0d51a5bfedde02bcd2154123fde.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 4 ++-- include/linux/mlx5/driver.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 9145e2d37c0e..73e4d50a9f02 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -122,7 +122,7 @@ enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, }; -#ifdef CONFIG_MLX5_ACCEL +#ifdef CONFIG_MLX5_EN_IPSEC u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); @@ -152,5 +152,5 @@ static inline int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } -#endif /* CONFIG_MLX5_ACCEL */ +#endif /* CONFIG_MLX5_EN_IPSEC */ #endif /* __MLX5_ACCEL_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9424503eb8d3..5af53c035949 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -778,7 +778,7 @@ struct mlx5_core_dev { #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; #endif -#ifdef CONFIG_MLX5_ACCEL +#ifdef CONFIG_MLX5_EN_IPSEC const struct mlx5_accel_ipsec_ops *ipsec_ops; #endif struct mlx5_clock clock; -- cgit v1.2.3 From f2b41b32cde8453a0a26875261f0e26809c2805a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:51 +0300 Subject: net/mlx5: Remove ipsec_ops function table There is only one IPsec implementation and ipsec_ops is not needed at all in this situation. Together with removal of ipsec_ops, we can drop the entry checks as these functions are called for IPsec devices only. Link: https://lore.kernel.org/r/bc8dd1c8a77b65dbf5e2cf92c813ffaca2505c5f.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5af53c035949..ff47d49d8be4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -777,9 +777,6 @@ struct mlx5_core_dev { } roce; #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; -#endif -#ifdef CONFIG_MLX5_EN_IPSEC - const struct mlx5_accel_ipsec_ops *ipsec_ops; #endif struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; -- cgit v1.2.3 From 2984287c4c19949d7eb451dcad0bd5c54a2a376f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 Apr 2022 11:25:52 +0300 Subject: net/mlx5: Remove not-implemented IPsec capabilities Clean a capabilities enum to remove not-implemented bits. Link: https://lore.kernel.org/r/1044bb7b779107ff38e48e3f6553421104f3f819.1649232994.git.leonro@nvidia.com Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky --- include/linux/mlx5/accel.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 73e4d50a9f02..0f2596297f6a 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -113,13 +113,10 @@ struct mlx5_accel_esp_xfrm { enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, - MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, - MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, - MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, - MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, - MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, - MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 6, - MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, + MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 1, + MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 2, + MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 3, + MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 4, }; #ifdef CONFIG_MLX5_EN_IPSEC -- cgit v1.2.3 From efaa0227f6c6a5073951b20cf2f8c63c4155306c Mon Sep 17 00:00:00 2001 From: tangmeng Date: Tue, 15 Feb 2022 14:50:19 +0800 Subject: timers: Move timer sysctl into the timer code This is part of the effort to reduce kernel/sysctl.c to only contain the core logic. Signed-off-by: tangmeng Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220215065019.7520-1-tangmeng@uniontech.com --- include/linux/timer.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index fda13c9d1256..648f00105f58 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -196,14 +196,6 @@ extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -struct ctl_table; - -extern unsigned int sysctl_timer_migration; -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); -- cgit v1.2.3 From a8b6d6708bb682108d8c899bc0cb7873240daf8a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:28 +0100 Subject: iio: core: Enhance the kernel doc of modes and currentmodes iio_dev entries Let's provide more details about these two variables because their understanding may not be straightforward for someone not used to the IIO subsystem internal logic. The different modes will soon be also be more documented for the same reason. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-2-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index faf00f2c0be6..f191b80466cd 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -488,8 +488,15 @@ struct iio_buffer_setup_ops { /** * struct iio_dev - industrial I/O device - * @modes: [DRIVER] operating modes supported by device - * @currentmode: [INTERN] current operating mode + * @modes: [DRIVER] bitmask listing all the operating modes + * supported by the IIO device. This list should be + * initialized before registering the IIO device. It can + * also be filed up by the IIO core, as a result of + * enabling particular features in the driver + * (see iio_triggered_event_setup()). + * @currentmode: [INTERN] operating mode currently in use, may be + * eventually checked by device drivers but should be + * considered read-only as this is a core internal bit * @dev: [DRIVER] device structure, should be assigned a parent * and owner * @buffer: [DRIVER] any buffer present -- cgit v1.2.3 From 474010127e2505fc463236470908e1ff5ddb3578 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:33 +0100 Subject: iio: st_sensors: Add a local lock for protecting odr Right now the (framework) mlock lock is (ab)used for multiple purposes: 1- protecting concurrent accesses over the odr local cache 2- avoid changing samplig frequency whilst buffer is running Let's start by handling situation #1 with a local lock. Suggested-by: Jonathan Cameron Cc: Denis Ciocca Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-7-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 22f67845cdd3..db4a1b260348 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -237,6 +237,7 @@ struct st_sensor_settings { * @hw_irq_trigger: if we're using the hardware interrupt on the sensor. * @hw_timestamp: Latest timestamp from the interrupt handler, when in use. * @buffer_data: Data used by buffer part. + * @odr_lock: Local lock for preventing concurrent ODR accesses/changes */ struct st_sensor_data { struct iio_trigger *trig; @@ -261,6 +262,8 @@ struct st_sensor_data { s64 hw_timestamp; char buffer_data[ST_SENSORS_MAX_BUFFER_SIZE] ____cacheline_aligned; + + struct mutex odr_lock; }; #ifdef CONFIG_IIO_BUFFER -- cgit v1.2.3 From 2f53b4adfede66f1bc1c8bb7efd7ced2bad1191a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:36 +0100 Subject: iio: Un-inline iio_buffer_enabled() As we are going to hide the currentmode inside the opaque structure, this helper would soon need to call a non-inline function which would simply drop the benefit of having the helper defined inline in a header. One alternative is to move this helper in the core as there is no more interest in defining it inline in a header. We will pay the minor cost either way. Let's do like the iio_device_id() helper which also refers to the opaque structure and gets defined in the core. Suggested-by: Jonathan Cameron Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-10-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index f191b80466cd..faabb852128a 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -550,6 +550,7 @@ struct iio_dev { }; int iio_device_id(struct iio_dev *indio_dev); +bool iio_buffer_enabled(struct iio_dev *indio_dev); const struct iio_chan_spec *iio_find_channel_from_si(struct iio_dev *indio_dev, int si); @@ -679,16 +680,6 @@ struct iio_dev *devm_iio_device_alloc(struct device *parent, int sizeof_priv); __printf(2, 3) struct iio_trigger *devm_iio_trigger_alloc(struct device *parent, const char *fmt, ...); -/** - * iio_buffer_enabled() - helper function to test if the buffer is enabled - * @indio_dev: IIO device structure for device - **/ -static inline bool iio_buffer_enabled(struct iio_dev *indio_dev) -{ - return indio_dev->currentmode - & (INDIO_BUFFER_TRIGGERED | INDIO_BUFFER_HARDWARE | - INDIO_BUFFER_SOFTWARE); -} /** * iio_get_debugfs_dentry() - helper function to get the debugfs_dentry -- cgit v1.2.3 From 8c576f87ad7eb639b8bd4472a9bb830e0696dda5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:37 +0100 Subject: iio: core: Hide read accesses to iio_dev->currentmode In order to later move this variable within the opaque structure, let's create a helper for accessing it in read-only mode. This helper will be exposed to device drivers and kept accessible for the few that could need it. The write access to this variable however should be fully reserved to the core so in a second step we will hide this variable into the opaque structure. Cc: Eugen Hristev Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Ludovic Desroches Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-11-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index faabb852128a..31098ffa7dc9 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -550,6 +550,7 @@ struct iio_dev { }; int iio_device_id(struct iio_dev *indio_dev); +int iio_device_get_current_mode(struct iio_dev *indio_dev); bool iio_buffer_enabled(struct iio_dev *indio_dev); const struct iio_chan_spec -- cgit v1.2.3 From 51570c9d4b3a678f77a50ac139f67290e946ec86 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:38 +0100 Subject: iio: core: Move the currentmode entry to the opaque structure This entry should, under no situation, be modified by device drivers. Now that we have limited its read access to device drivers really needing it and did so through a dedicated helper, we can easily move this variable to the opaque structure in order to prevent any further modification from non-authorized code (out of the core, basically). Signed-off-by: Miquel Raynal Reviewed-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20220207143840.707510-12-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 4 ++++ include/linux/iio/iio.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 2be12b7b5dc5..6b3586b3f952 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -7,6 +7,9 @@ * struct iio_dev_opaque - industrial I/O device opaque information * @indio_dev: public industrial I/O device information * @id: used to identify device internally + * @currentmode: operating mode currently in use, may be eventually + * checked by device drivers but should be considered + * read-only as this is a core internal bit * @driver_module: used to make it harder to undercut users * @info_exist_lock: lock to prevent use during removal * @trig_readonly: mark the current trigger immutable @@ -36,6 +39,7 @@ */ struct iio_dev_opaque { struct iio_dev indio_dev; + int currentmode; int id; struct module *driver_module; struct mutex info_exist_lock; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 31098ffa7dc9..85cb924debd9 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -494,9 +494,6 @@ struct iio_buffer_setup_ops { * also be filed up by the IIO core, as a result of * enabling particular features in the driver * (see iio_triggered_event_setup()). - * @currentmode: [INTERN] operating mode currently in use, may be - * eventually checked by device drivers but should be - * considered read-only as this is a core internal bit * @dev: [DRIVER] device structure, should be assigned a parent * and owner * @buffer: [DRIVER] any buffer present @@ -523,7 +520,6 @@ struct iio_buffer_setup_ops { */ struct iio_dev { int modes; - int currentmode; struct device dev; struct iio_buffer *buffer; -- cgit v1.2.3 From f67c6c73cb07a4778425a2064640333ef7bfa42b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:39 +0100 Subject: iio: core: Simplify the registration of kfifo buffers Among all the users of the kfifo buffers, no one uses the INDIO_BUFFER_HARDWARE mode. So let's take this as a general rule and simplify a little bit the internals - overall the documentation - by eliminating unused specific cases. Use the INDIO_BUFFER_SOFTWARE mode by default with kfifo buffers, which will basically mimic what all the "non direct" modes do. Cc: Benson Leung Cc: Guenter Roeck Cc: Jyoti Bhayana Cc: Jean-Baptiste Maneyrol Cc: Lorenzo Bianconi Cc: Michael Hennerich Cc: Greg Kroah-Hartman Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-13-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/kfifo_buf.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h index ccd2ceae7b25..8a83fb58232d 100644 --- a/include/linux/iio/kfifo_buf.h +++ b/include/linux/iio/kfifo_buf.h @@ -12,11 +12,10 @@ void iio_kfifo_free(struct iio_buffer *r); int devm_iio_kfifo_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, - int mode_flags, const struct iio_buffer_setup_ops *setup_ops, const struct attribute **buffer_attrs); -#define devm_iio_kfifo_buffer_setup(dev, indio_dev, mode_flags, setup_ops) \ - devm_iio_kfifo_buffer_setup_ext((dev), (indio_dev), (mode_flags), (setup_ops), NULL) +#define devm_iio_kfifo_buffer_setup(dev, indio_dev, setup_ops) \ + devm_iio_kfifo_buffer_setup_ext((dev), (indio_dev), (setup_ops), NULL) #endif -- cgit v1.2.3 From 4f1a22ee7b576a38dc5705837c9b0de0c7b5b064 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 8 Apr 2022 17:04:12 +0800 Subject: libata: Improve ATA queued command allocation Improve ATA queued command allocation as follows: - For attaining a qc tag for a SAS host we need to allocate a bit in ata_port.sas_tag_allocated bitmap. However we already have a unique tag per device in range [0, ATA_MAX_QUEUE -1] in the scsi cmnd budget token, so just use that instead. - It is a bit pointless to have ata_qc_new_init() in libata-core.c since it pokes scsi internals, so inline it in ata_scsi_qc_new() (in libata-scsi.c). Also update Doc accordingly. - Use standard SCSI helpers set_host_byte() and set_status_byte() in ata_scsi_qc_new(). Christoph Hellwig originally contributed the change to inline ata_qc_new_init(). Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Damien Le Moal --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 9b1d3d8b1252..16107122e587 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -820,7 +820,6 @@ struct ata_port { unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE + 1]; - unsigned long sas_tag_allocated; /* for sas tag allocation only */ u64 qc_active; int nr_active_links; /* #links with active qcs */ unsigned int sas_last_tag; /* track next tag hw expects */ -- cgit v1.2.3 From 9f8ed577c28813410614b418bad42285840c1a00 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:50 +0800 Subject: net: skb: rename SKB_DROP_REASON_PTYPE_ABSENT As David Ahern suggested, the reasons for skb drops should be more general and not be code based. Therefore, rename SKB_DROP_REASON_PTYPE_ABSENT to SKB_DROP_REASON_UNHANDLED_PROTO, which is used for the cases of no L3 protocol handler, no L4 protocol handler, version extensions, etc. From previous discussion, now we have the aim to make these reasons more abstract and users based, avoiding code based. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2394441fa3dd..173bc35a10a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -408,11 +408,9 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ - SKB_DROP_REASON_PTYPE_ABSENT, /* not packet_type found to handle - * the skb. For an etner packet, - * this means that L3 protocol is - * not supported - */ + SKB_DROP_REASON_UNHANDLED_PROTO, /* protocol not implemented + * or not supported + */ SKB_DROP_REASON_SKB_CSUM, /* sk_buff checksum computation * error */ -- cgit v1.2.3 From b384c95a861eebf47e88695cf6a29f34e0b10b0f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:52 +0800 Subject: net: icmp: add skb drop reasons to icmp protocol Replace kfree_skb() used in icmp_rcv() and icmpv6_rcv() with kfree_skb_reason(). In order to get the reasons of the skb drops after icmp message handle, we change the return type of 'handler()' in 'struct icmp_control' from 'bool' to 'enum skb_drop_reason'. This may change its original intention, as 'false' means failure, but 'SKB_NOT_DROPPED_YET' means success now. Therefore, all 'handler' and the call of them need to be handled. Following 'handler' functions are involved: icmp_unreach() icmp_redirect() icmp_echo() icmp_timestamp() icmp_discard() And following new drop reasons are added: SKB_DROP_REASON_ICMP_CSUM SKB_DROP_REASON_INVALID_PROTO The reason 'INVALID_PROTO' is introduced for the case that the packet doesn't follow rfc 1122 and is dropped. This is not a common case, and I believe we can locate the problem from the data in the packet. For now, this 'INVALID_PROTO' is used for the icmp broadcasts with wrong types. Maybe there should be a document file for these reasons. For example, list all the case that causes the 'UNHANDLED_PROTO' and 'INVALID_PROTO' drop reason. Therefore, users can locate their problems according to the document. Reviewed-by: Hao Peng Reviewed-by: Jiang Biao Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 173bc35a10a3..9b81ba497665 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -442,6 +442,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TAP_TXFILTER, /* dropped by tx filter implemented * at tun/tap, e.g., check_filter() */ + SKB_DROP_REASON_ICMP_CSUM, /* ICMP checksum error */ + SKB_DROP_REASON_INVALID_PROTO, /* the packet doesn't follow RFC + * 2211, such as a broadcasts + * ICMP_TIMESTAMP + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 2e2ac4a3327479f7e2744cdd88a5c823f2057bad Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Wed, 6 Apr 2022 22:15:20 +0200 Subject: tty: goldfish: Introduce gf_ioread32()/gf_iowrite32() The goldfish TTY device was clearly defined as having little-endian registers, but the switch to __raw_{read,write}l(() broke its driver when running on big-endian kernels (if anyone ever tried this). The m68k qemu implementation got this wrong, and assumed native-endian registers. While this is a bug in qemu, it is probably impossible to fix that since there is no way of knowing which other operating systems have started relying on that bug over the years. Hence revert commit da31de35cd2f ("tty: goldfish: use __raw_writel()/__raw_readl()", and define gf_ioread32()/gf_iowrite32() to be able to use accessors defined by the architecture. Cc: stable@vger.kernel.org # v5.11+ Fixes: da31de35cd2fb78f ("tty: goldfish: use __raw_writel()/__raw_readl()") Signed-off-by: Laurent Vivier Link: https://lore.kernel.org/r/20220406201523.243733-2-laurent@vivier.eu [geert: Add rationale based on Arnd's comments] Signed-off-by: Geert Uytterhoeven --- include/linux/goldfish.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/goldfish.h b/include/linux/goldfish.h index 12be1601fd84..bcc17f95b906 100644 --- a/include/linux/goldfish.h +++ b/include/linux/goldfish.h @@ -8,14 +8,21 @@ /* Helpers for Goldfish virtual platform */ +#ifndef gf_ioread32 +#define gf_ioread32 ioread32 +#endif +#ifndef gf_iowrite32 +#define gf_iowrite32 iowrite32 +#endif + static inline void gf_write_ptr(const void *ptr, void __iomem *portl, void __iomem *porth) { const unsigned long addr = (unsigned long)ptr; - __raw_writel(lower_32_bits(addr), portl); + gf_iowrite32(lower_32_bits(addr), portl); #ifdef CONFIG_64BIT - __raw_writel(upper_32_bits(addr), porth); + gf_iowrite32(upper_32_bits(addr), porth); #endif } @@ -23,9 +30,9 @@ static inline void gf_write_dma_addr(const dma_addr_t addr, void __iomem *portl, void __iomem *porth) { - __raw_writel(lower_32_bits(addr), portl); + gf_iowrite32(lower_32_bits(addr), portl); #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT - __raw_writel(upper_32_bits(addr), porth); + gf_iowrite32(upper_32_bits(addr), porth); #endif } -- cgit v1.2.3 From 52126d4c03798cc55aa927fea4c776ab26b5a5f0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 6 Feb 2022 10:47:45 +0100 Subject: dmaengine: Remove a useless mutex According to lib/idr.c, The IDA handles its own locking. It is safe to call any of the IDA functions without synchronisation in your code. so the 'chan_mutex' mutex can just be removed. It is here only to protect some ida_alloc()/ida_free() calls. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/7180452c1d77b039e27b6f9418e0e7d9dd33c431.1644140845.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 842d4f7ca752..6db9e03afd0b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -870,7 +870,6 @@ struct dma_device { struct device *dev; struct module *owner; struct ida chan_ida; - struct mutex chan_mutex; /* to protect chan_ida */ u32 src_addr_widths; u32 dst_addr_widths; -- cgit v1.2.3 From 95ebe80d99de3cb849c522a1f768e5e8befa0b7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jan 2022 13:16:18 -0800 Subject: srcu: Fix s/is/if/ typo in srcu_node comment This commit fixed a typo in the srcu_node structure's ->srcu_have_cbs comment. While in the area, redo a couple of comments to take advantage of 100-character line lengths. Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index cb1f4351e8ba..4025840ba9a3 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,11 +47,9 @@ struct srcu_data { */ struct srcu_node { spinlock_t __private lock; - unsigned long srcu_have_cbs[4]; /* GP seq for children */ - /* having CBs, but only */ - /* is > ->srcu_gq_seq. */ - unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ - /* have CBs for given GP? */ + unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ + /* if greater than ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ -- cgit v1.2.3 From 994f706872e6ce080506bd795ecf783d5b617de6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jan 2022 09:46:57 -0800 Subject: srcu: Make Tree SRCU able to operate without snp_node array This commit makes Tree SRCU able to operate without an snp_node array, that is, when the srcu_data structures' ->mynode pointers are NULL. This can result in high contention on the srcu_struct structure's ->lock, but only when there are lots of call_srcu(), synchronize_srcu(), and synchronize_srcu_expedited() calls. Note that when there is no snp_node array, all SRCU callbacks use CPU 0's callback queue. This is optimal in the common case of low update-side load because it removes the need to search each CPU for the single callback that made the grace period happen. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 4025840ba9a3..8d1da136a93a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -63,8 +63,9 @@ struct srcu_struct { struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ + int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters */ + spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -83,6 +84,17 @@ struct srcu_struct { struct lockdep_map dep_map; }; +/* Values for size state variable (->srcu_size_state). */ +#define SRCU_SIZE_SMALL 0 +#define SRCU_SIZE_ALLOC 1 +#define SRCU_SIZE_WAIT_BARRIER 2 +#define SRCU_SIZE_WAIT_CALL 3 +#define SRCU_SIZE_WAIT_CBS1 4 +#define SRCU_SIZE_WAIT_CBS2 5 +#define SRCU_SIZE_WAIT_CBS3 6 +#define SRCU_SIZE_WAIT_CBS4 7 +#define SRCU_SIZE_BIG 8 + /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 -- cgit v1.2.3 From 2ec303113d978931ef368886c4c6bc854493e8bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jan 2022 16:13:52 -0800 Subject: srcu: Dynamically allocate srcu_node array This commit shrinks the srcu_struct structure by converting its ->node field from a fixed-size compile-time array to a pointer to a dynamically allocated array. In kernels built with large values of NR_CPUS that boot on systems with smaller numbers of CPUs, this can save significant memory. [ paulmck: Apply kernel test robot feedback. ] Reported-by: A cast of thousands Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8d1da136a93a..8501b6b45941 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -60,7 +60,7 @@ struct srcu_node { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ -- cgit v1.2.3 From db8f1471c61336477e2bf74dcb00e67d650e6dea Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 26 Jan 2022 10:03:54 -0500 Subject: srcu: Use export for srcu_struct defined by DEFINE_STATIC_SRCU() If an srcu_struct structure defined by tree SRCU's DEFINE_STATIC_SRCU() is used by a module, sparse will give the following diagnostic: sparse: symbol '__srcu_struct_nodes_srcu' was not declared. Should it be static? The problem is that a within-module DEFINE_STATIC_SRCU() must define a non-static srcu_struct because it is exported by referencing it in a special '__section("___srcu_struct_ptrs")'. This reference is needed so that module load and unloading can invoke init_srcu_struct() and cleanup_srcu_struct(), respectively. Unfortunately, sparse is unaware of '__section("___srcu_struct_ptrs")', resulting in the above false-positive diagnostic. To avoid this false positive, this commit therefore creates a prototype of the srcu_struct with an "extern" keyword. Signed-off-by: Alexander Aring Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8501b6b45941..44e998643f48 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -131,6 +131,7 @@ struct srcu_struct { #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ is_static struct srcu_struct name; \ + extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -- cgit v1.2.3 From 46470cf85d2b61abd37c6f66c4dacc1bc510d10f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 13:20:49 -0800 Subject: srcu: Prevent cleanup_srcu_struct() from freeing non-dynamic ->sda When an srcu_struct structure is created (but not in a kernel module) by DEFINE_SRCU() and friends, the per-CPU srcu_data structure is statically allocated. In all other cases, that structure is obtained from alloc_percpu(), in which case cleanup_srcu_struct() must invoke free_percpu() on the resulting ->sda pointer in the srcu_struct pointer. Which it does. Except that it also invokes free_percpu() on the ->sda pointer referencing the statically allocated per-CPU srcu_data structures. Which free_percpu() is surprisingly OK with. This commit nevertheless stops cleanup_srcu_struct() from freeing statically allocated per-CPU srcu_data structures. Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44e998643f48..44bd204498a1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -73,6 +73,7 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; -- cgit v1.2.3 From 9f2e91d94c91558e3764fe4e01c5e6281a90f239 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jan 2022 20:32:05 -0800 Subject: srcu: Add contention-triggered addition of srcu_node tree This commit instruments the acquisitions of the srcu_struct structure's ->lock, enabling the initiation of a transition from SRCU_SIZE_SMALL to SRCU_SIZE_BIG when sufficient contention is experienced. The instrumentation counts the number of trylock failures within the confines of a single jiffy. If that number exceeds the value specified by the srcutree.small_contention_lim kernel boot parameter (which defaults to 100), and if the value specified by the srcutree.convert_to_big kernel boot parameter has the 0x10 bit set (defaults to 0), then a transition will be automatically initiated. By default, there will never be any transitions, so that none of the srcu_struct structures ever gains an srcu_node array. The useful values for srcutree.convert_to_big are: 0x00: Never convert. 0x01: Always convert at init_srcu_struct() time. 0x02: Convert when rcutorture prints its first round of statistics. 0x03: Decide conversion approach at boot given system size. 0x10: Convert if contention is encountered. 0x12: Convert if contention is encountered or when rcutorture prints its first round of statistics, whichever comes first. The value 0x11 acts the same as 0x01 because the conversion happens before there is any chance of contention. [ paulmck: Apply "static" feedback from kernel test robot. ] Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 44bd204498a1..1b9ff4ed37e4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -72,6 +72,8 @@ struct srcu_struct { unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ + unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ -- cgit v1.2.3 From 5d90070816534882b9158f14154b7e2cdef1194a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Mar 2022 10:41:44 -0800 Subject: rcu-tasks: Make Tasks RCU account for userspace execution The main Tasks RCU quiescent state is voluntary context switch. However, userspace execution is also a valid quiescent state, and is a valuable one for userspace applications that spin repeatedly executing light-weight non-sleeping system calls. Currently, such an application can delay a Tasks RCU grace period for many tens of seconds. This commit therefore enlists the aid of the scheduler-clock interrupt to provide a Tasks RCU quiescent state when it interrupted a task executing in userspace. [ paulmck: Apply feedback from kernel test robot. ] Cc: Martin KaFai Lau Cc: Neil Spring Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e7c39c200e2b..1a32036c918c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -196,6 +196,7 @@ void synchronize_rcu_tasks_rude(void); void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +#define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu -- cgit v1.2.3 From bd6c375b92c3f367e184d164e12952e4b9d9fb4f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 15 Mar 2022 16:33:38 +0100 Subject: rcutorture: Call preempt_schedule() through static call/key The rcutorture test suite sometimess triggers a random scheduler preemption call while simulating a read delay. Unfortunately, its direct call to preempt_schedule() bypasses the static call/key filter used by CONFIG_PREEMPT_DYNAMIC. This breaks the no-preempt assumption when the dynamic preemption mode is "none". For example, rcu_blocking_is_gp() is fooled and abbreviates grace periods when the CPU runs in no-preempt UP mode. Fix this by making torture_preempt_schedule() call __preempt_schedule(), which uses the static call/key. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 63fa4196e51c..7038104463e4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -118,7 +118,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); _torture_stop_kthread("Stopping " #n " task", &(tp)) #ifdef CONFIG_PREEMPTION -#define torture_preempt_schedule() preempt_schedule() +#define torture_preempt_schedule() __preempt_schedule() #else #define torture_preempt_schedule() do { } while (0) #endif -- cgit v1.2.3 From a28c1ab312712c26a8d004af1f68628d625dafac Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 9 Apr 2022 22:13:56 +0300 Subject: ata: libata-core: fix parameter type in ata_xfer_mode2shift() The data transfer mode that corresponds to the 'xfer_mode' parameter for ata_xfer_mode2shift() is a 8-bit *unsigned* value. Using *unsigned long* to declare the parameter leads to a problematic implicit *int* to *unsigned long* cast and was most probably a result of a copy/paste mistake -- use the 'u8' type instead, as in ata_xfer_mode2mask()... Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Signed-off-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 16107122e587..732de9014626 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1110,7 +1110,7 @@ extern void ata_unpack_xfermask(unsigned long xfer_mask, unsigned long *udma_mask); extern u8 ata_xfer_mask2mode(unsigned long xfer_mask); extern unsigned long ata_xfer_mode2mask(u8 xfer_mode); -extern int ata_xfer_mode2shift(unsigned long xfer_mode); +extern int ata_xfer_mode2shift(u8 xfer_mode); extern const char *ata_mode_string(unsigned long xfer_mask); extern unsigned long ata_id_xfermask(const u16 *id); extern int ata_std_qc_defer(struct ata_queued_cmd *qc); -- cgit v1.2.3 From 1a95e04e29a116c3424988c70c441ca8ec2779ff Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 12 Apr 2022 11:24:00 +0100 Subject: net: phylink: remove phylink_helper_basex_speed() As there are now no users of phylink_helper_basex_speed(), we can remove this obsolete functionality. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 223781622b33..6d06896fc20d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -160,11 +160,6 @@ struct phylink_mac_ops { * clearing unsupported speeds and duplex settings. The port modes * should not be cleared; phylink_set_port_modes() will help with this. * - * If the @state->interface mode is %PHY_INTERFACE_MODE_1000BASEX - * or %PHY_INTERFACE_MODE_2500BASEX, select the appropriate mode - * based on @state->advertising and/or @state->speed and update - * @state->interface accordingly. See phylink_helper_basex_speed(). - * * When @config->supported_interfaces has been set, phylink will iterate * over the supported interfaces to determine the full capability of the * MAC. The validation function must not print errors if @state->interface @@ -579,7 +574,6 @@ int phylink_speed_up(struct phylink *pl); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); -void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); -- cgit v1.2.3 From 1306d5362a591493a2d07f685ed2cc480dcda320 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:56 +0300 Subject: net: add ndo_fdb_del_bulk Add a new netdev op called ndo_fdb_del_bulk, it will be later used for driver-specific bulk delete implementation dispatched from rtnetlink. The first user will be the bridge, we need it to signal to rtnetlink from the driver that we support bulk delete operation (NLM_F_BULK). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28ea4f8269d4..a602f29365b0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1260,6 +1260,10 @@ struct netdev_net_notifier { * struct net_device *dev, * const unsigned char *addr, u16 vid) * Deletes the FDB entry from dev coresponding to addr. + * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], + * struct net_device *dev, + * u16 vid, + * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) @@ -1510,6 +1514,11 @@ struct net_device_ops { struct net_device *dev, const unsigned char *addr, u16 vid); + int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + u16 vid, + struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From d6d3146ce532268ad0ffd8d92d2b7492898decf1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:52 +0800 Subject: skb: add some helpers for skb drop reasons In order to simply the definition and assignment for 'enum skb_drop_reason', introduce some helpers. SKB_DR() is used to define a variable of type 'enum skb_drop_reason' with the 'SKB_DROP_REASON_NOT_SPECIFIED' initial value. SKB_DR_SET() is used to set the value of the variable. Seems it is a little useless? But it makes the code shorter. SKB_DR_OR() is used to set the value of the variable if it is not set yet, which means its value is SKB_DROP_REASON_NOT_SPECIFIED. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b81ba497665..0cbd6ada957c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -450,6 +450,18 @@ enum skb_drop_reason { SKB_DROP_REASON_MAX, }; +#define SKB_DR_INIT(name, reason) \ + enum skb_drop_reason name = SKB_DROP_REASON_##reason +#define SKB_DR(name) \ + SKB_DR_INIT(name, NOT_SPECIFIED) +#define SKB_DR_SET(name, reason) \ + (name = SKB_DROP_REASON_##reason) +#define SKB_DR_OR(name, reason) \ + do { \ + if (name == SKB_DROP_REASON_NOT_SPECIFIED) \ + SKB_DR_SET(name, reason); \ + } while (0) + /* To allow 64K frame to be packed as single skb without frag_list we * require 64K/PAGE_SIZE pages plus 1 additional page to allow for * buffers which do not start on a page boundary. -- cgit v1.2.3 From c4eb664191b4a5ff6856478f903924176697719e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:53 +0800 Subject: net: ipv4: add skb drop reasons to ip_error() Eventually, I find out the handler function for inputting route lookup fail: ip_error(). The drop reasons we used in ip_error() are almost corresponding to IPSTATS_MIB_*, and following new reasons are introduced: SKB_DROP_REASON_IP_INADDRERRORS SKB_DROP_REASON_IP_INNOROUTES Isn't the name SKB_DROP_REASON_IP_HOSTUNREACH and SKB_DROP_REASON_IP_NETUNREACH more accurate? To make them corresponding to IPSTATS_MIB_*, we keep their name still. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0cbd6ada957c..886e83ac4b70 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -447,6 +447,12 @@ enum skb_drop_reason { * 2211, such as a broadcasts * ICMP_TIMESTAMP */ + SKB_DROP_REASON_IP_INADDRERRORS, /* host unreachable, corresponding + * to IPSTATS_MIB_INADDRERRORS + */ + SKB_DROP_REASON_IP_INNOROUTES, /* network unreachable, corresponding + * to IPSTATS_MIB_INADDRERRORS + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 2edc1a383fda8d2f580216292dfd9daeae691e47 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:55 +0800 Subject: net: ip: add skb drop reasons to ip forwarding Replace kfree_skb() which is used in ip6_forward() and ip_forward() with kfree_skb_reason(). The new drop reason 'SKB_DROP_REASON_PKT_TOO_BIG' is introduced for the case that the length of the packet exceeds MTU and can't fragment. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 886e83ac4b70..0ef11df1bc67 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -453,6 +453,9 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INNOROUTES, /* network unreachable, corresponding * to IPSTATS_MIB_INADDRERRORS */ + SKB_DROP_REASON_PKT_TOO_BIG, /* packet size is too big (maybe exceed + * the MTU) + */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 1ad6d548e2a452f21bcee4606ee4ec7afcde5f37 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 13 Apr 2022 16:15:56 +0800 Subject: net: icmp: introduce function icmpv6_param_prob_reason() In order to add the skb drop reasons support to icmpv6_param_prob(), introduce the function icmpv6_param_prob_reason() and make icmpv6_param_prob() an inline call to it. This new function will be used in the following patches. Signed-off-by: Menglong Dong Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: David S. Miller --- include/linux/icmpv6.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 9055cb380ee2..db0f4fcfdaf4 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -79,8 +79,9 @@ extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); -extern void icmpv6_param_prob(struct sk_buff *skb, - u8 code, int pos); +extern void icmpv6_param_prob_reason(struct sk_buff *skb, + u8 code, int pos, + enum skb_drop_reason reason); struct flowi6; struct in6_addr; @@ -91,6 +92,12 @@ extern void icmpv6_flow_init(struct sock *sk, const struct in6_addr *daddr, int oif); +static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmpv6_param_prob_reason(skb, code, pos, + SKB_DROP_REASON_NOT_SPECIFIED); +} + static inline bool icmpv6_is_err(int type) { switch (type) { -- cgit v1.2.3 From bdc21a4d286c6917fe966fd765de99411095b1b4 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:22 +0000 Subject: PM: EM: Add .get_cost() callback The Energy Model (EM) supports devices which report abstract power scale, not only real Watts. The primary goal for EM is to enable the Energy Aware Scheduler (EAS) for a given platform. Some of the platforms might not be able to deliver proper power values. The only information that they might have is the relative efficiency between CPU types. Thus, it makes sense to remove some restrictions in the EM framework and introduce a mechanism which would support those platforms. What is crucial for EAS to operate is the 'cost' field in the EM. The 'cost' is calculated internally in EM framework based on knowledge from 'power' values. The 'cost' values must be strictly increasing. The existing API with its 'power' value size restrictions cannot guarantee that the 'cost' will meet this requirement. Since the platform is missing this detailed information, but has only efficiency details, introduce a new custom callback in the EM framework. The new callback would allow to provide the 'cost' values which reflect efficiency of the CPUs. This would allow to provide EAS information which has different relation than what would be forced by the EM internal formulas calculating 'cost' values. Thanks to this new callback it is possible to create a system view for EAS which has no overlapping performance states across many Performance Domains. Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 9f3c400bc52d..0a3a5663177b 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -114,9 +114,30 @@ struct em_data_callback { */ int (*active_power)(unsigned long *power, unsigned long *freq, struct device *dev); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = cb) +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -264,6 +285,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0) -- cgit v1.2.3 From fc3a9a9858478ab5f8441765a3f9552a0ceba10c Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Mon, 21 Mar 2022 09:57:23 +0000 Subject: PM: EM: Add artificial EM flag The Energy Model (EM) can be used on platforms which are missing real power information. Those platforms would implement .get_cost() which populates needed values for the Energy Aware Scheduler (EAS). The EAS doesn't use 'power' fields from EM, but other frameworks might use them. Thus, to avoid miss-usage of this specific type of EM, introduce a new flags which can be checked by other frameworks. Signed-off-by: Pierre Gondois Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 0a3a5663177b..92e82a322859 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -67,11 +67,16 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating * energy consumption. + * + * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be + * created by platform missing real power information */ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) +#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2) #define em_span_cpus(em) (to_cpumask((em)->cpus)) +#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL) #ifdef CONFIG_ENERGY_MODEL #define EM_MAX_POWER 0xFFFF -- cgit v1.2.3 From 75a3a99a5a9886af13be44e640cb415ebda80db2 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 21 Mar 2022 09:57:25 +0000 Subject: PM: EM: Change the order of arguments in the .active_power() callback The .active_power() callback passes the device pointer when it's called. Aligned with a convetion present in other subsystems and pass the 'dev' as a first argument. It looks more cleaner. Adjust all affected drivers which implement that API callback. Suggested-by: Ionela Voinescu Signed-off-by: Lukasz Luba Reviewed-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 92e82a322859..8419bffb4398 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -101,11 +101,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -117,8 +117,8 @@ struct em_data_callback { * * Return 0 on success. */ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); /** * get_cost() - Provide the cost at the given performance state of -- cgit v1.2.3 From ce1cb680ff1c5b88505f878137796b1723e00193 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Thu, 24 Mar 2022 08:07:30 +0000 Subject: PM: sleep: enable dynamic debug support within pm_pr_dbg() Currently pm_pr_dbg() is used to filter kernel pm debug messages based on pm_debug_messages_on flag. The problem is if we enable/disable this flag it will affect all pm_pr_dbg() calls at once, so we can't individually control them. This patch changes pm_pr_dbg() implementation as such: - If pm_debug_messages_on is enabled, print the message. - If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, only print the messages explicitly enabled on /sys/kernel/debug/dynamic_debug/control. - If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, don't print the message. Signed-off-by: David Cohen Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 300273ff40cc..70f2921e2e70 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -542,22 +542,56 @@ static inline void unlock_system_sleep(void) {} #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; -extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...); +static inline int pm_dyn_debug_messages_on(void) +{ +#ifdef CONFIG_DYNAMIC_DEBUG + return 1; +#else + return 0; +#endif +} +#ifndef pr_fmt +#define pr_fmt(fmt) "PM: " fmt +#endif +#define __pm_pr_dbg(fmt, ...) \ + do { \ + if (pm_debug_messages_on) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + else if (pm_dyn_debug_messages_on()) \ + pr_debug(fmt, ##__VA_ARGS__); \ + } while (0) +#define __pm_deferred_pr_dbg(fmt, ...) \ + do { \ + if (pm_debug_messages_on) \ + printk_deferred(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + } while (0) #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include -#define __pm_pr_dbg(defer, fmt, ...) \ - no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) +#define __pm_pr_dbg(fmt, ...) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#define __pm_deferred_pr_dbg(fmt, ...) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif +/** + * pm_pr_dbg - print pm sleep debug messages + * + * If pm_debug_messages_on is enabled, print message. + * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, + * print message only from instances explicitly enabled on dynamic debug's + * control. + * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, + * don't print message. + */ #define pm_pr_dbg(fmt, ...) \ - __pm_pr_dbg(false, fmt, ##__VA_ARGS__) + __pm_pr_dbg(fmt, ##__VA_ARGS__) #define pm_deferred_pr_dbg(fmt, ...) \ - __pm_pr_dbg(true, fmt, ##__VA_ARGS__) + __pm_deferred_pr_dbg(fmt, ##__VA_ARGS__) #ifdef CONFIG_PM_AUTOSLEEP -- cgit v1.2.3 From 1227418989346af3af179742cf42ce842e0ad484 Mon Sep 17 00:00:00 2001 From: Dov Murik Date: Tue, 12 Apr 2022 21:21:24 +0000 Subject: efi: Save location of EFI confidential computing area Confidential computing (coco) hardware such as AMD SEV (Secure Encrypted Virtualization) allows a guest owner to inject secrets into the VMs memory without the host/hypervisor being able to read them. Firmware support for secret injection is available in OVMF, which reserves a memory area for secret injection and includes a pointer to it the in EFI config table entry LINUX_EFI_COCO_SECRET_TABLE_GUID. If EFI exposes such a table entry, uefi_init() will keep a pointer to the EFI config table entry in efi.coco_secret, so it can be used later by the kernel (specifically drivers/virt/coco/efi_secret). It will also appear in the kernel log as "CocoSecret=ADDRESS"; for example: [ 0.000000] efi: EFI v2.70 by EDK II [ 0.000000] efi: CocoSecret=0x7f22e680 SMBIOS=0x7f541000 ACPI=0x7f77e000 ACPI 2.0=0x7f77e014 MEMATTR=0x7ea0c018 The new functionality can be enabled with CONFIG_EFI_COCO_SECRET=y. Signed-off-by: Dov Murik Reviewed-by: Gerd Hoffmann Link: https://lore.kernel.org/r/20220412212127.154182-2-dovmurik@linux.ibm.com Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index ccd4d3f91c98..771d4cd06b56 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -405,6 +405,7 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) +#define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) @@ -596,6 +597,7 @@ extern struct efi { unsigned long tpm_log; /* TPM2 Event Log table */ unsigned long tpm_final_log; /* TPM2 Final Events Log table */ unsigned long mokvar_table; /* MOK variable config table */ + unsigned long coco_secret; /* Confidential computing secret table */ efi_get_time_t *get_time; efi_set_time_t *set_time; @@ -1335,4 +1337,12 @@ extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) { } #endif +struct linux_efi_coco_secret_area { + u64 base_pa; + u64 size; +}; + +/* Header of a populated EFI secret area */ +#define EFI_SECRET_TABLE_HEADER_GUID EFI_GUID(0x1e74f542, 0x71dd, 0x4d66, 0x96, 0x3e, 0xef, 0x42, 0x87, 0xff, 0x17, 0x3b) + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From aa480379d8bdb33920d68acfd90f823c8af32578 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 4 Mar 2022 07:36:37 +0100 Subject: efi: Add missing prototype for efi_capsule_setup_info Fixes "no previous declaration for 'efi_capsule_setup_info'" warnings under W=1. Fixes: 2959c95d510c ("efi/capsule: Add support for Quark security header") Signed-off-by: Jan Kiszka Link: https://lore.kernel.org/r/c28d3f86-dd72-27d1-e2c2-40971b8da6bd@siemens.com Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 771d4cd06b56..fd266198f9d8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -213,6 +213,8 @@ struct capsule_info { size_t page_bytes_remain; }; +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes); int __efi_capsule_setup_info(struct capsule_info *cap_info); /* -- cgit v1.2.3 From 002752af7b89b74c64fe6bec8c5fde3d3a7810d8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Apr 2022 21:48:40 +0300 Subject: device property: Allow error pointer to be passed to fwnode APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the fwnode APIs might return an error pointer instead of NULL or valid fwnode handle. The result of such API call may be considered optional and hence the test for it is usually done in a form of fwnode = fwnode_find_reference(...); if (IS_ERR(fwnode)) ...error handling... Nevertheless the resulting fwnode may have bumped the reference count and hence caller of the above API is obliged to call fwnode_handle_put(). Since fwnode may be not valid either as NULL or error pointer the check has to be performed there. This approach uglifies the code and adds a point of making a mistake, i.e. forgetting about error point case. To prevent this, allow an error pointer to be passed to the fwnode APIs. Fixes: 83b34afb6b79 ("device property: Introduce fwnode_find_reference()") Reported-by: Nuno Sá Tested-by: Nuno Sá Acked-by: Nuno Sá Reviewed-by: Sakari Ailus Reviewed-by: Heikki Krogerus Signed-off-by: Andy Shevchenko Tested-by: Michael Walle Signed-off-by: Rafael J. Wysocki --- include/linux/fwnode.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 6ab69871b06d..9a81c4410b9f 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -153,12 +153,12 @@ struct fwnode_operations { int (*add_links)(struct fwnode_handle *fwnode); }; -#define fwnode_has_op(fwnode, op) \ - ((fwnode) && (fwnode)->ops && (fwnode)->ops->op) +#define fwnode_has_op(fwnode, op) \ + (!IS_ERR_OR_NULL(fwnode) && (fwnode)->ops && (fwnode)->ops->op) + #define fwnode_call_int_op(fwnode, op, ...) \ - (fwnode ? (fwnode_has_op(fwnode, op) ? \ - (fwnode)->ops->op(fwnode, ## __VA_ARGS__) : -ENXIO) : \ - -EINVAL) + (fwnode_has_op(fwnode, op) ? \ + (fwnode)->ops->op(fwnode, ## __VA_ARGS__) : (IS_ERR_OR_NULL(fwnode) ? -EINVAL : -ENXIO)) #define fwnode_call_bool_op(fwnode, op, ...) \ (fwnode_has_op(fwnode, op) ? \ -- cgit v1.2.3 From 87ffea09470d94c93dd6a5a22d4b2216b395d1ea Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Apr 2022 21:48:41 +0300 Subject: device property: Introduce fwnode_for_each_parent_node() In a few cases the functionality of fwnode_for_each_parent_node() is already in use. Introduce a common helper macro for it. It may be used by others as well in the future. Signed-off-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 4cd4b326941f..15d6863ae962 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -83,9 +83,14 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *fwnode_get_name(const struct fwnode_handle *fwnode); const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); + struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); -struct fwnode_handle *fwnode_get_next_parent( - struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); + +#define fwnode_for_each_parent_node(fwnode, parent) \ + for (parent = fwnode_get_parent(fwnode); parent; \ + parent = fwnode_get_next_parent(parent)) + struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, -- cgit v1.2.3 From 022fe6bc8f3bd0f2b1f9bc778bacc38020dfd420 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Apr 2022 21:48:42 +0300 Subject: device property: Drop 'test' prefix in parameters of fwnode_is_ancestor_of() The part 'is' in the function name implies the test against something. Drop unnecessary 'test' prefix in the fwnode_is_ancestor_of() parameters. No functional change intended. Signed-off-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 15d6863ae962..fc24d45632eb 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -95,8 +95,7 @@ struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); -bool fwnode_is_ancestor_of(struct fwnode_handle *test_ancestor, - struct fwnode_handle *test_child); +bool fwnode_is_ancestor_of(struct fwnode_handle *ancestor, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( -- cgit v1.2.3 From 4e140f59d285c1ca1e5c81b4c13e27366865bd09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Jan 2022 23:15:27 +0000 Subject: mm/usercopy: Check kmap addresses properly If you are copying to an address in the kmap region, you may not copy across a page boundary, no matter what the size of the underlying allocation. You can't kmap() a slab page because slab pages always come from low memory. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220110231530.665970-2-willy@infradead.org --- include/linux/highmem-internal.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a77be5630209..337bd9f32921 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -149,6 +149,11 @@ static inline void totalhigh_pages_add(long count) atomic_long_add(count, &_totalhigh_pages); } +static inline bool is_kmap_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + return addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP); +} #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) @@ -234,6 +239,11 @@ static inline void __kunmap_atomic(void *addr) static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } +static inline bool is_kmap_addr(const void *x) +{ + return false; +} + #endif /* CONFIG_HIGHMEM */ /* -- cgit v1.2.3 From e6f3b3c9c109ed57230996cf4a4c1b8ae7e36a81 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 5 Apr 2022 15:16:18 -0700 Subject: cfi: Use __builtin_function_start Clang 14 added support for the __builtin_function_start function, which allows us to implement the function_nocfi macro without architecture-specific inline assembly and in a way that also works with static initializers. Change CONFIG_CFI_CLANG to depend on Clang >= 14, define function_nocfi using __builtin_function_start, and remove the arm64 inline assembly implementation. Link: https://github.com/llvm/llvm-project/commit/ec2e26eaf63558934f5b73a6e530edc453cf9508 Link: https://github.com/ClangBuiltLinux/linux/issues/1353 Signed-off-by: Sami Tolvanen Reviewed-by: Nick Desaulniers Reviewed-by: Mark Rutland Tested-by: Mark Rutland Acked-by: Will Deacon # arm64 Reviewed-by: Nathan Chancellor Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220405221618.633743-1-samitolvanen@google.com --- include/linux/compiler-clang.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index babb1347148c..c84fec767445 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -69,6 +69,16 @@ #define __nocfi __attribute__((__no_sanitize__("cfi"))) #define __cficanonical __attribute__((__cfi_canonical_jump_table__)) +#if defined(CONFIG_CFI_CLANG) +/* + * With CONFIG_CFI_CLANG, the compiler replaces function address + * references with the address of the function's CFI jump table + * entry. The function_nocfi macro always returns the address of the + * actual function instead. + */ +#define function_nocfi(x) __builtin_function_start(x) +#endif + /* * Turn individual warnings and errors on and off locally, depending * on version. -- cgit v1.2.3 From 63cec1389e116ae0e2a15e612a5b49651e04be3f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Apr 2022 18:09:14 -0700 Subject: fscrypt: split up FS_CRYPTO_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE is neither the filesystem block size nor the granularity of encryption. Rather, it defines two logically separate constraints that both arise from the block size of the AES cipher: - The alignment required for the lengths of file contents blocks - The minimum input/output length for the filenames encryption modes Since there are way too many things called the "block size", and the connection with the AES block size is not easily understood, split FS_CRYPTO_BLOCK_SIZE into two constants FSCRYPT_CONTENTS_ALIGNMENT and FSCRYPT_FNAME_MIN_MSG_LEN that more clearly describe what they are. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220405010914.18519-1-ebiggers@kernel.org --- include/linux/fscrypt.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 50d92d805bd8..efc7f96e5e26 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -18,7 +18,17 @@ #include #include -#define FS_CRYPTO_BLOCK_SIZE 16 +/* + * The lengths of all file contents blocks must be divisible by this value. + * This is needed to ensure that all contents encryption modes will work, as + * some of the supported modes don't support arbitrarily byte-aligned messages. + * + * Since the needed alignment is 16 bytes, most filesystems will meet this + * requirement naturally, as typical block sizes are powers of 2. However, if a + * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via + * compression), then it will need to pad to this alignment before encryption. + */ +#define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_info; -- cgit v1.2.3 From 9554e908fb5d02e48a681d1eca180225bf109e83 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:51 -0400 Subject: ELF: Remove elf_core_copy_kernel_regs() x86-32 was the last architecture that implemented separate user and kernel registers. Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Acked-by: Michael Ellerman Link: https://lore.kernel.org/r/20220325153953.162643-3-brgerst@gmail.com --- include/linux/elfcore.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index f8e206e82476..346a8b56cdc8 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -84,15 +84,6 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re #endif } -static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs) -{ -#ifdef ELF_CORE_COPY_KERNEL_REGS - ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs); -#else - elf_core_copy_regs(elfregs, regs); -#endif -} - static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) { #if defined (ELF_CORE_COPY_TASK_REGS) -- cgit v1.2.3 From 64b97df995f0c943be469a019d6117c89e2131bc Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Wed, 13 Apr 2022 03:44:14 +0200 Subject: cdc_ether: export usbnet_cdc_zte_rx_fixup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit bfe9b9d2df66 ("cdc_ether: Improve ZTE MF823/831/910 handling") introduces a workaround for certain ZTE modems reporting invalid MAC addresses over CDC-ECM. The same issue was present on their RNDIS interface,which was fixed in commit a5a18bdf7453 ("rndis_host: Set valid random MAC on buggy devices"). However, internal modem of ZTE MF286R router, on its RNDIS interface, also exhibits a second issue fixed already in CDC-ECM, of the device not respecting configured random MAC address. In order to share the fixup for this with rndis_host driver, export the workaround function, which will be re-used in the following commit in rndis_host. Cc: Kristian Evensen Cc: Bjørn Mork Cc: Oliver Neukum Signed-off-by: Lech Perczak Signed-off-by: Paolo Abeni --- include/linux/usb/usbnet.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 8336e86ce606..1b4d72d5e891 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -214,6 +214,7 @@ extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); +extern int usbnet_cdc_zte_rx_fixup(struct usbnet *dev, struct sk_buff *skb); /* CDC and RNDIS support the same host-chosen packet filters for IN transfers */ #define DEFAULT_FILTER (USB_CDC_PACKET_TYPE_BROADCAST \ -- cgit v1.2.3 From 36e747972d8b4c09e6e3275e31a3acba46e2c4d2 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Wed, 13 Apr 2022 03:44:15 +0200 Subject: rndis_host: enable the bogus MAC fixup for ZTE devices from cdc_ether MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain ZTE modems, namely: MF823. MF831, MF910, built-in modem from MF286R, expose both CDC-ECM and RNDIS network interfaces. They have a trait of ignoring the locally-administered MAC address configured on the interface both in CDC-ECM and RNDIS part, and this leads to dropping of incoming traffic by the host. However, the workaround was only present in CDC-ECM, and MF286R explicitly requires it in RNDIS mode. Re-use the workaround in rndis_host as well, to fix operation of MF286R module, some versions of which expose only the RNDIS interface. Do so by introducing new flag, RNDIS_DRIVER_DATA_DST_MAC_FIXUP, and testing for it in rndis_rx_fixup. This is required, as RNDIS uses frame batching, and all of the packets inside the batch need the fixup. This might introduce a performance penalty, because test is done for every returned Ethernet frame. Apply the workaround to both "flavors" of RNDIS interfaces, as older ZTE modems, like MF823 found in the wild, report the USB_CLASS_COMM class interfaces, while MF286R reports USB_CLASS_WIRELESS_CONTROLLER. Suggested-by: Bjørn Mork Cc: Kristian Evensen Cc: Oliver Neukum Signed-off-by: Lech Perczak Signed-off-by: Paolo Abeni --- include/linux/usb/rndis_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/rndis_host.h b/include/linux/usb/rndis_host.h index 809bccd08455..cc42db51bbba 100644 --- a/include/linux/usb/rndis_host.h +++ b/include/linux/usb/rndis_host.h @@ -197,6 +197,7 @@ struct rndis_keepalive_c { /* IN (optionally OUT) */ /* Flags for driver_info::data */ #define RNDIS_DRIVER_DATA_POLL_STATUS 1 /* poll status before control */ +#define RNDIS_DRIVER_DATA_DST_MAC_FIXUP 2 /* device ignores configured MAC address */ extern void rndis_status(struct usbnet *dev, struct urb *urb); extern int -- cgit v1.2.3 From 3dc6ffae2da201284cb24af66af77ee0bbb2efaa Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Thu, 14 Apr 2022 11:18:03 +0200 Subject: timekeeping: Introduce fast accessor to clock tai Introduce fast/NMI safe accessor to clock tai for tracing. The Linux kernel tracing infrastructure has support for using different clocks to generate timestamps for trace events. Especially in TSN networks it's useful to have TAI as trace clock, because the application scheduling is done in accordance to the network time, which is based on TAI. With a tai trace_clock in place, it becomes very convenient to correlate network activity with Linux kernel application traces. Use the same implementation as ktime_get_boot_fast_ns() does by reading the monotonic time and adding the TAI offset. The same limitations as for the fast boot implementation apply. The TAI offset may change at run time e.g., by setting the time or using adjtimex() with an offset. However, these kind of offset changes are rare events. Nevertheless, the user has to be aware and deal with it in post processing. An alternative approach would be to use the same implementation as ktime_get_real_fast_ns() does. However, this requires to add an additional u64 member to the tk_read_base struct. This struct together with a seqcount is designed to fit into a single cache line on 64 bit architectures. Adding a new member would violate this constraint. Signed-off-by: Kurt Kanzenbach Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Link: https://lore.kernel.org/r/20220414091805.89667-2-kurt@linutronix.de --- include/linux/timekeeping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 78a98bdff76d..fe1e467ba046 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -177,6 +177,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); extern u64 ktime_get_boot_fast_ns(void); +extern u64 ktime_get_tai_fast_ns(void); extern u64 ktime_get_real_fast_ns(void); /* -- cgit v1.2.3 From af47d8033fc731f19600efd27ba4a7d0fdfcc77c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 14 Apr 2022 21:42:48 +0300 Subject: gpiolib: Introduce a helper to get first GPIO controller node Introduce a helper to get first GPIO controller node which drivers may want to use. Signed-off-by: Andy Shevchenko Tested-by: Marek Szyprowski --- include/linux/gpio/driver.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 12de0b22b4ef..83e2d72e51bb 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -766,4 +766,14 @@ static inline unsigned int gpiochip_node_count(struct device *dev) return count; } +static inline struct fwnode_handle *gpiochip_node_get_first(struct device *dev) +{ + struct fwnode_handle *fwnode; + + for_each_gpiochip_node(dev, fwnode) + return fwnode; + + return NULL; +} + #endif /* __LINUX_GPIO_DRIVER_H */ -- cgit v1.2.3 From f1724d397c60d296c0805c95a46ae7fc7163b70c Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 9 Apr 2022 16:03:18 +0800 Subject: crypto: hisilicon/qm - add register checking for ACC Add register detection function to accelerator. Provided a tool that user can checking differential register through Debugfs. e.g. cd /sys/kernel/debug/hisi_zip//zip_dfx cat diff_regs Signed-off-by: Longfang Liu Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 177f7b7cd414..39acc0316a60 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -168,6 +168,12 @@ enum qm_vf_state { QM_NOT_READY, }; +struct dfx_diff_registers { + u32 *regs; + u32 reg_offset; + u32 reg_len; +}; + struct qm_dfx { atomic64_t err_irq_cnt; atomic64_t aeq_irq_cnt; @@ -190,6 +196,8 @@ struct qm_debug { struct dentry *debug_root; struct dentry *qm_d; struct debugfs_file files[DEBUG_FILE_NUM]; + struct dfx_diff_registers *qm_diff_regs; + struct dfx_diff_registers *acc_diff_regs; }; struct qm_shaper_factor { @@ -448,6 +456,12 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs); void hisi_qm_dev_err_init(struct hisi_qm *qm); void hisi_qm_dev_err_uninit(struct hisi_qm *qm); +int hisi_qm_diff_regs_init(struct hisi_qm *qm, + struct dfx_diff_registers *dregs, int reg_len); +void hisi_qm_diff_regs_uninit(struct hisi_qm *qm, int reg_len); +void hisi_qm_acc_diff_regs_dump(struct hisi_qm *qm, struct seq_file *s, + struct dfx_diff_registers *dregs, int regs_len); + pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev, pci_channel_state_t state); pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev); -- cgit v1.2.3 From a888ccd6c66683a49977ba6a2b91fe52fbec9367 Mon Sep 17 00:00:00 2001 From: Kai Ye Date: Sat, 9 Apr 2022 16:03:25 +0800 Subject: crypto: hisilicon/qm - add last word dumping for ACC Add last word dumping function during acc engines controller reset. The last words are reported to the printed information during the reset. The dmesg information included qm debugging registers and engine debugging registers. It can help to improve debugging capability. Signed-off-by: Kai Ye Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 39acc0316a60..e5522eaf88fd 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -196,6 +196,9 @@ struct qm_debug { struct dentry *debug_root; struct dentry *qm_d; struct debugfs_file files[DEBUG_FILE_NUM]; + unsigned int *qm_last_words; + /* ACC engines recoreding last regs */ + unsigned int *last_words; struct dfx_diff_registers *qm_diff_regs; struct dfx_diff_registers *acc_diff_regs; }; @@ -251,6 +254,7 @@ struct hisi_qm_err_ini { void (*open_sva_prefetch)(struct hisi_qm *qm); void (*close_sva_prefetch)(struct hisi_qm *qm); void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts); + void (*show_last_dfx_regs)(struct hisi_qm *qm); void (*err_info_init)(struct hisi_qm *qm); }; -- cgit v1.2.3 From f6f586102add59d57bcc6eea06fdeaae11bb17a1 Mon Sep 17 00:00:00 2001 From: Eric Tremblay Date: Wed, 30 Mar 2022 12:46:40 +0200 Subject: serial: 8250: Handle UART without interrupt on TEMT using em485 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the UART_CAP_NOTEMT capability. The capability indicates that the UART doesn't have an interrupt available on TEMT. In the case where the device does not support it, we calculate the maximum time it could take for the transmitter to empty the shift register. When we get in the situation where we get the THRE interrupt, we check if the TEMT bit is set. If it's not, we start the a timer and recall __stop_tx() after the delay. The transmit sequence is a bit modified when the capability is set. The new timer is used between the last interrupt(THRE) and a potential stop_tx timer. Signed-off-by: Giulio Benetti [moved to use added UART_CAP_TEMT] Signed-off-by: Heiko Stuebner [moved to use added UART_CAP_NOTEMT, improve timeout] Signed-off-by: Eric Tremblay [rebased to v5.17, making use of tty_get_frame_size] Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20220330104642.229507-2-u.kleine-koenig@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index ff84a3ed10ea..de135852107c 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -79,7 +79,9 @@ struct uart_8250_ops { struct uart_8250_em485 { struct hrtimer start_tx_timer; /* "rs485 start tx" timer */ struct hrtimer stop_tx_timer; /* "rs485 stop tx" timer */ + struct hrtimer no_temt_timer; /* "rs485 no TEMT interrupt" timer */ struct hrtimer *active_timer; /* pointer to active timer */ + unsigned long no_temt_delay; /* Delay for no_temt_timer */ struct uart_8250_port *port; /* for hrtimer callbacks */ unsigned int tx_stopped:1; /* tx is currently stopped */ }; -- cgit v1.2.3 From 4dc84c06a343fcb95fd5a0acb537aefa4ebdd1b0 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Tue, 12 Apr 2022 10:01:19 +0800 Subject: net: ethtool: extend ringparam set/get APIs for tx_push Currently tx push is a standard driver feature which controls use of a fast path descriptor push. So this patch extends the ringparam APIs and data structures to support set/get tx push by ethtool -G/g. Signed-off-by: Jie Wang Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4af58459a1e7..99dc7bfbcd3c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,11 +71,13 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. * @tcp_data_split: Scatter packet headers and data to separate buffers + * @tx_push: The flag of tx push mode * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u8 tx_push; u32 cqe_size; }; @@ -83,10 +85,12 @@ struct kernel_ethtool_ringparam { * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size + * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), ETHTOOL_RING_USE_CQE_SIZE = BIT(1), + ETHTOOL_RING_USE_TX_PUSH = BIT(2), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) -- cgit v1.2.3 From f9a2fb73318eb4dbf8cd84866b8b0dd012d8b116 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Fri, 15 Apr 2022 08:34:02 +0000 Subject: net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131 Add a new neighbour cache entry in STALE state for routers on receiving an unsolicited (gratuitous) neighbour advertisement with target link-layer-address option specified. This is similar to the arp_accept configuration for IPv4. A new sysctl endpoint is created to turn on this behaviour: /proc/sys/net/ipv6/conf/interface/accept_unsolicited_na. Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 16870f86c74d..918bfea4ef5f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,6 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; + __s32 accept_unsolicited_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; -- cgit v1.2.3 From da40b613f89c43c58986e6f30560ad6573a4d569 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:41 -0700 Subject: tcp: add drop reason support to tcp_validate_incoming() Creates four new drop reasons for the following cases: 1) packet being rejected by RFC 7323 PAWS check 2) packet being rejected by SEQUENCE check 3) Invalid RST packet 4) Invalid SYN packet Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0ef11df1bc67..a903da1fa0ed 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -381,6 +381,12 @@ enum skb_drop_reason { * the ofo queue, corresponding to * LINUX_MIB_TCPOFOMERGE */ + SKB_DROP_REASON_TCP_RFC7323_PAWS, /* PAWS check, corresponding to + * LINUX_MIB_PAWSESTABREJECTED + */ + SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */ + SKB_DROP_REASON_TCP_RESET, /* Invalid RST packet */ + SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 669da7a71890b2b2a31a7e9571c0fdf1123e26ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:43 -0700 Subject: tcp: add drop reasons to tcp_rcv_state_process() Add basic support for drop reasons in tcp_rcv_state_process() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a903da1fa0ed..6f1410b5ff13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -387,6 +387,9 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /* Not acceptable SEQ field */ SKB_DROP_REASON_TCP_RESET, /* Invalid RST packet */ SKB_DROP_REASON_TCP_INVALID_SYN, /* Incoming packet has unexpected SYN flag */ + SKB_DROP_REASON_TCP_CLOSE, /* TCP socket in CLOSE state */ + SKB_DROP_REASON_TCP_FASTOPEN, /* dropped by FASTOPEN request socket */ + SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 4b506af9c5b8de0da34097d50d9448dfb33d70c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:44 -0700 Subject: tcp: add two drop reasons for tcp_ack() Add TCP_TOO_OLD_ACK and TCP_ACK_UNSENT_DATA drop reasons so that tcp_rcv_established() can report them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f1410b5ff13..9ff5557b1909 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -390,6 +390,8 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_CLOSE, /* TCP socket in CLOSE state */ SKB_DROP_REASON_TCP_FASTOPEN, /* dropped by FASTOPEN request socket */ SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ + SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ + SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From e7c89ae4078eab24af71ba26b91642e819a4bd7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:45 -0700 Subject: tcp: add drop reason support to tcp_prune_ofo_queue() Add one reason for packets dropped from OFO queue because of memory pressure. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9ff5557b1909..ad15ad208b56 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -392,6 +392,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_OLD_ACK, /* TCP ACK is old, but in window */ SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ + SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 8fbf195798b56e1e87f62d01be636a6425c304c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Apr 2022 17:10:48 -0700 Subject: tcp: add drop reason support to tcp_ofo_queue() packets in OFO queue might be redundant, and dropped. tcp_drop() is no longer needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ad15ad208b56..84d78df60453 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -393,6 +393,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_TOO_OLD_ACK, /* TCP ACK is too old */ SKB_DROP_REASON_TCP_ACK_UNSENT_DATA, /* TCP ACK for data we haven't sent yet */ SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE, /* pruned from TCP OFO queue */ + SKB_DROP_REASON_TCP_OFO_DROP, /* data already in receive queue */ SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by * BPF_PROG_TYPE_CGROUP_SKB -- cgit v1.2.3 From 0df71650c051ab106c921de257f4b38e9e3dd251 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2022 16:35:24 -0400 Subject: block: allow using the per-cpu bio cache from bio_alloc_bioset Replace the BIO_PERCPU_CACHE bio-internal flag with a REQ_ALLOC_CACHE one that can be passed to bio_alloc / bio_alloc_bioset, and implement the percpu cache allocation logic in a helper called from bio_alloc_bioset. This allows any bio_alloc_bioset user to use the percpu caches instead of having the functionality tied to struct kiocb. Signed-off-by: Mike Snitzer [hch: refactored a bit] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220324203526.62306-2-snitzer@kernel.org Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- include/linux/blk_types.h | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 278cc81cc1e7..0fe8a51a0778 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,8 +405,6 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, struct bio_set *bs); -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, - unsigned short nr_vecs, unsigned int opf, struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1973ef9bd40f..4968cb17b13b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,7 +329,6 @@ enum { BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ - BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ BIO_FLAG_LAST }; @@ -414,6 +413,7 @@ enum req_flag_bits { __REQ_NOUNMAP, /* do not free blocks when zeroing */ __REQ_POLLED, /* caller polls for completion using bio_poll */ + __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ /* for driver use */ __REQ_DRV, @@ -439,6 +439,7 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_POLLED (1ULL << __REQ_POLLED) +#define REQ_ALLOC_CACHE (1ULL << __REQ_ALLOC_CACHE) #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) -- cgit v1.2.3 From b53f3dcd705e52a07bd0311870dbfb6842e88c91 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2022 16:35:25 -0400 Subject: block: allow use of per-cpu bio alloc cache by block drivers Refine per-cpu bio alloc cache interfaces so that DM and other block drivers can properly create and use the cache: DM uses bioset_init_from_src() to do its final bioset initialization, so must update bioset_init_from_src() to set BIOSET_PERCPU_CACHE if %src bioset has a cache. Also move bio_clear_polled() to include/linux/bio.h to allow users outside of block core. Signed-off-by: Mike Snitzer Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220324203526.62306-3-snitzer@kernel.org Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 0fe8a51a0778..8e87f3065042 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -780,6 +780,12 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) bio->bi_opf |= REQ_NOWAIT; } +static inline void bio_clear_polled(struct bio *bio) +{ + /* can't support alloc cache if we turn off polling */ + bio->bi_opf &= ~(REQ_POLLED | REQ_ALLOC_CACHE); +} + struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, unsigned int opf, gfp_t gfp); -- cgit v1.2.3 From 066ff571011d8416e903d3d4f1f41e0b5eb91e1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Apr 2022 08:12:27 +0200 Subject: block: turn bio_kmalloc into a simple kmalloc wrapper Remove the magic autofree semantics and require the callers to explicitly call bio_init to initialize the bio. This allows bio_free to catch accidental bio_put calls on bio_init()ed bios as well. Signed-off-by: Christoph Hellwig Acked-by: Coly Li Acked-by: Mike Snitzer Link: https://lore.kernel.org/r/20220406061228.410163-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e87f3065042..49eff01fb829 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,7 +405,7 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, struct bio_set *bs); -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, -- cgit v1.2.3 From 10f0d2a517796b8f6dc04fb0cc3e49003ae6b0bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:42 +0200 Subject: block: add a bdev_nonrot helper Add a helper to check the nonrot flag based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d016138997..3a9578e14a6b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1326,6 +1326,11 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) return 0; } +static inline bool bdev_nonrot(struct block_device *bdev) +{ + return blk_queue_nonrot(bdev_get_queue(bdev)); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 08e688fdb8f7e862092ae64cee20bc8b463d1046 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:43 +0200 Subject: block: add a bdev_write_cache helper Add a helper to check the write cache flag based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a9578e14a6b..807a49aa5a27 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1331,6 +1331,11 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_write_cache(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From a557e82e5a01826f902bd94fc925c03f253cb712 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:44 +0200 Subject: block: add a bdev_fua helper Add a helper to check the FUA flag based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 807a49aa5a27..075b16d4560e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -602,7 +602,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) -#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) @@ -1336,6 +1335,11 @@ static inline bool bdev_write_cache(struct block_device *bdev) return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_fua(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 36d254893aa6a6e204075c3cce94bb572ac32c04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:45 +0200 Subject: block: add a bdev_stable_writes helper Add a helper to check the stable writes flag based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 075b16d4560e..a433798c3343 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1330,6 +1330,12 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_stable_writes(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_STABLE_WRITES, + &bdev_get_queue(bdev)->queue_flags); +} + static inline bool bdev_write_cache(struct block_device *bdev) { return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags); -- cgit v1.2.3 From 2aba0d19f4d8c8929b4b3b94a9cfde2aa20e6ee2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:46 +0200 Subject: block: add a bdev_max_zone_append_sectors helper Add a helper to check the max supported sectors for zone append based on the block_device instead of having to poke into the block layer internal request_queue. Signed-off-by: Christoph Hellwig Acked-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a433798c3343..f8c50b77543e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1188,6 +1188,12 @@ static inline unsigned int queue_max_zone_append_sectors(const struct request_qu return min(l->max_zone_append_sectors, l->max_sectors); } +static inline unsigned int +bdev_max_zone_append_sectors(struct block_device *bdev) +{ + return queue_max_zone_append_sectors(bdev_get_queue(bdev)); +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; -- cgit v1.2.3 From 640f2a23911b8388989547f89d055afbb910b88e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:48 +0200 Subject: block: use bdev_alignment_offset in disk_alignment_offset_show This does the same as the open coded variant except for an extra branch, and allows to remove queue_alignment_offset entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-18-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f8c50b77543e..d5346e72e364 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1251,14 +1251,6 @@ bdev_zone_write_granularity(struct block_device *bdev) return queue_zone_write_granularity(bdev_get_queue(bdev)); } -static inline int queue_alignment_offset(const struct request_queue *q) -{ - if (q->limits.misaligned) - return -1; - - return q->limits.alignment_offset; -} - static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); -- cgit v1.2.3 From 89098b075cb74a80083bc4ed6b71d0ee18b6898f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:49 +0200 Subject: block: move bdev_alignment_offset and queue_limit_alignment_offset out of line No need to inline these fairly larger helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-19-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5346e72e364..0a1795ac2627 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1251,26 +1251,7 @@ bdev_zone_write_granularity(struct block_device *bdev) return queue_zone_write_granularity(bdev_get_queue(bdev)); } -static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) -{ - unsigned int granularity = max(lim->physical_block_size, lim->io_min); - unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) - << SECTOR_SHIFT; - - return (granularity + lim->alignment_offset - alignment) % granularity; -} - -static inline int bdev_alignment_offset(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q->limits.misaligned) - return -1; - if (bdev_is_partition(bdev)) - return queue_limit_alignment_offset(&q->limits, - bdev->bd_start_sect); - return q->limits.alignment_offset; -} +int bdev_alignment_offset(struct block_device *bdev); static inline int queue_discard_alignment(const struct request_queue *q) { -- cgit v1.2.3 From 4e1462ffe8998749884d61f91be251a7a8719677 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:50 +0200 Subject: block: remove queue_discard_alignment Just use bdev_alignment_offset in disk_discard_alignment_show instead. That helpers is the same except for an always false branch that doesn't matter in this slow path. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-20-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a1795ac2627..5a9b7aeda010 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1253,14 +1253,6 @@ bdev_zone_write_granularity(struct block_device *bdev) int bdev_alignment_offset(struct block_device *bdev); -static inline int queue_discard_alignment(const struct request_queue *q) -{ - if (q->limits.discard_misaligned) - return -1; - - return q->limits.discard_alignment; -} - static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; -- cgit v1.2.3 From 5c4b4a5c6f11c869a57c6bd977143430bc9dc43d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:52 +0200 Subject: block: move {bdev,queue_limit}_discard_alignment out of line No need to inline these fairly larger helpers. Also fix the return value to be unsigned, just like the field in struct queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220415045258.199825-22-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a9b7aeda010..34b1cfd06742 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1252,39 +1252,7 @@ bdev_zone_write_granularity(struct block_device *bdev) } int bdev_alignment_offset(struct block_device *bdev); - -static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) -{ - unsigned int alignment, granularity, offset; - - if (!lim->max_discard_sectors) - return 0; - - /* Why are these in bytes, not sectors? */ - alignment = lim->discard_alignment >> SECTOR_SHIFT; - granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (!granularity) - return 0; - - /* Offset of the partition start in 'granularity' sectors */ - offset = sector_div(sector, granularity); - - /* And why do we do this modulus *again* in blkdev_issue_discard()? */ - offset = (granularity + alignment - offset) % granularity; - - /* Turn it back into bytes, gaah */ - return offset << SECTOR_SHIFT; -} - -static inline int bdev_discard_alignment(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (bdev_is_partition(bdev)) - return queue_limit_discard_alignment(&q->limits, - bdev->bd_start_sect); - return q->limits.discard_alignment; -} +unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { -- cgit v1.2.3 From cf0fbf894bb543f472f682c486be48298eccf199 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:54 +0200 Subject: block: add a bdev_max_discard_sectors helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper to query the number of sectors support per each discard bio based on the block device and use this helper to stop various places from poking into the request_queue to see if discard is supported and if so how much. This mirrors what is done e.g. for write zeroes as well. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-24-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 34b1cfd06742..ce16247d3afa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1254,6 +1254,11 @@ bdev_zone_write_granularity(struct block_device *bdev) int bdev_alignment_offset(struct block_device *bdev); unsigned int bdev_discard_alignment(struct block_device *bdev); +static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) +{ + return bdev_get_queue(bdev)->limits.max_discard_sectors; +} + static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 70200574cc229f6ba038259e8142af2aa09e6976 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:55 +0200 Subject: block: remove QUEUE_FLAG_DISCARD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just use a non-zero max_discard_sectors as an indicator for discard support, similar to what is done for write zeroes. The only places where needs special attention is the RAID5 driver, which must clear discard support for security reasons by default, even if the default stacking rules would allow for it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Jan Höppner [s390] Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-25-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce16247d3afa..767ab22e1052 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -540,7 +540,6 @@ struct request_queue { #define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ -#define QUEUE_FLAG_DISCARD 8 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SECERASE 11 /* supports secure erase */ @@ -582,7 +581,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ -- cgit v1.2.3 From 7b47ef52d0a2025fd1408a8a0990933b8e1e510f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:56 +0200 Subject: block: add a bdev_discard_granularity helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract away implementation details from file systems by providing a block_device based helper to retrieve the discard granularity. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Ryusuke Konishi Acked-by: David Sterba [btrfs] Link: https://lore.kernel.org/r/20220415045258.199825-26-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 767ab22e1052..f1cf557ea20e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1257,6 +1257,11 @@ static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) return bdev_get_queue(bdev)->limits.max_discard_sectors; } +static inline unsigned int bdev_discard_granularity(struct block_device *bdev) +{ + return bdev_get_queue(bdev)->limits.discard_granularity; +} + static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 44abff2c0b970ae3d310b97617525dc01f248d7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Apr 2022 06:52:57 +0200 Subject: block: decouple REQ_OP_SECURE_ERASE from REQ_OP_DISCARD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Secure erase is a very different operation from discard in that it is a data integrity operation vs hint. Fully split the limits and helper infrastructure to make the separation more clear. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Christoph Böhmwalder [drbd] Acked-by: Ryusuke Konishi [nifs2] Acked-by: Jaegeuk Kim [f2fs] Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Acked-by: Chao Yu Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220415045258.199825-27-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f1cf557ea20e..c9b5925af5a3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -248,6 +248,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; @@ -542,7 +543,6 @@ struct request_queue { #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 11 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ @@ -583,8 +583,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) -#define blk_queue_secure_erase(q) \ - (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) @@ -947,6 +945,8 @@ extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_discard_segments(struct request_queue *, unsigned short); +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); @@ -1087,13 +1087,12 @@ static inline long nr_blockdev_pages(void) extern void blk_io_schedule(void); -#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ - -extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); -extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop); +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ @@ -1112,7 +1111,7 @@ static inline int sb_issue_discard(struct super_block *sb, sector_t block, SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), - gfp_mask, flags); + gfp_mask); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) @@ -1262,6 +1261,12 @@ static inline unsigned int bdev_discard_granularity(struct block_device *bdev) return bdev_get_queue(bdev)->limits.discard_granularity; } +static inline unsigned int +bdev_max_secure_erase_sectors(struct block_device *bdev) +{ + return bdev_get_queue(bdev)->limits.max_secure_erase_sectors; +} + static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From a2daa27c0c6137481226aee5b3136e453c642929 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:44:42 +0100 Subject: swiotlb: simplify swiotlb_max_segment Remove the bogus Xen override that was usually larger than the actual size and just calculate the value on demand. Note that swiotlb_max_segment still doesn't make sense as an interface and should eventually be removed. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index f6c3638255d5..9fb3a568f0c5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -164,7 +164,6 @@ static inline void swiotlb_adjust_size(unsigned long size) #endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); -extern void swiotlb_set_max_segment(unsigned int); #ifdef CONFIG_DMA_RESTRICTED_POOL struct page *swiotlb_alloc(struct device *dev, size_t size); -- cgit v1.2.3 From 0d5ffd9a256d8995764f9d4a35a8c3917839d169 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:07:28 +0100 Subject: swiotlb: rename swiotlb_late_init_with_default_size swiotlb_late_init_with_default_size is an overly verbose name that doesn't even catch what the function is doing, given that the size is not just a default but the actual requested size. Rename it to swiotlb_init_late. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 9fb3a568f0c5..b48b26bfa0ed 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -40,7 +40,7 @@ extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -extern int swiotlb_late_init_with_default_size(size_t default_size); +int swiotlb_init_late(size_t size); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, -- cgit v1.2.3 From 78013eaadf696d2105982abb4018fbae394ca08f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 14:11:44 +0100 Subject: x86: remove the IOMMU table infrastructure The IOMMU table tries to separate the different IOMMUs into different backends, but actually requires various cross calls. Rewrite the code to do the generic swiotlb/swiotlb-xen setup directly in pci-dma.c and then just call into the IOMMU drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/dmar.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 45e903d84733..cbd714a198a0 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -121,7 +121,7 @@ extern int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment, struct dmar_dev_scope *devices, int count); /* Intel IOMMU detection */ -extern int detect_intel_iommu(void); +void detect_intel_iommu(void); extern int enable_drhd_fault_handling(void); extern int dmar_device_add(acpi_handle handle); extern int dmar_device_remove(acpi_handle handle); @@ -197,6 +197,10 @@ static inline bool dmar_platform_optin(void) return false; } +static inline void detect_intel_iommu(void) +{ +} + #endif /* CONFIG_DMAR_TABLE */ struct irte { -- cgit v1.2.3 From c6af2aa9ffc9763826607bc2664ef3ea4475ed18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Mar 2022 17:27:33 +0200 Subject: swiotlb: make the swiotlb_init interface more useful Pass a boolean flag to indicate if swiotlb needs to be enabled based on the addressing needs, and replace the verbose argument with a set of flags, including one to force enable bounce buffering. Note that this patch removes the possibility to force xen-swiotlb use with the swiotlb=force parameter on the command line on x86 (arm and arm64 never supported that), but this interface will be restored shortly. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b48b26bfa0ed..ae0407173e84 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -13,11 +13,8 @@ struct device; struct page; struct scatterlist; -enum swiotlb_force { - SWIOTLB_NORMAL, /* Default - depending on HW DMA mask etc. */ - SWIOTLB_FORCE, /* swiotlb=force */ - SWIOTLB_NO_FORCE, /* swiotlb=noforce */ -}; +#define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */ +#define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */ /* * Maximum allowable number of contiguous slabs to map, @@ -36,8 +33,7 @@ enum swiotlb_force { /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) -extern void swiotlb_init(int verbose); -int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); +int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); int swiotlb_init_late(size_t size); @@ -126,13 +122,16 @@ static inline bool is_swiotlb_force_bounce(struct device *dev) return mem && mem->force_bounce; } +void swiotlb_init(bool addressing_limited, unsigned int flags); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else -#define swiotlb_force SWIOTLB_NO_FORCE +static inline void swiotlb_init(bool addressing_limited, unsigned int flags) +{ +} static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; -- cgit v1.2.3 From 8ba2ed1be90fc210126f68186564707478552c95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:36:57 +0200 Subject: swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction Power SVM wants to allocate a swiotlb buffer that is not restricted to low memory for the trusted hypervisor scheme. Consolidate the support for this into the swiotlb_init interface by adding a new flag. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ae0407173e84..eabdd8998702 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -15,6 +15,7 @@ struct scatterlist; #define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */ #define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */ +#define SWIOTLB_ANY (1 << 2) /* allow any memory for the buffer */ /* * Maximum allowable number of contiguous slabs to map, -- cgit v1.2.3 From 742519538e6b07250c8085bbff4bd358bc03bf16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 11:12:59 +0100 Subject: swiotlb: pass a gfp_mask argument to swiotlb_init_late Let the caller chose a zone to allocate from. This will be used later on by the xen-swiotlb initialization on arm. Signed-off-by: Christoph Hellwig Reviewed-by: Anshuman Khandual Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index eabdd8998702..ee655f2e4d28 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -37,7 +37,7 @@ struct scatterlist; int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -int swiotlb_init_late(size_t size); +int swiotlb_init_late(size_t size, gfp_t gfp_mask); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, -- cgit v1.2.3 From 7374153d294eb51de5a81ac38ff1c4fef8927bec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 08:02:57 +0100 Subject: swiotlb: provide swiotlb_init variants that remap the buffer To shared more code between swiotlb and xen-swiotlb, offer a swiotlb_init_remap interface and add a remap callback to swiotlb_init_late that will allow Xen to remap the buffer without duplicating much of the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ee655f2e4d28..7b50c82f84ce 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -36,8 +36,11 @@ struct scatterlist; int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); +void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)); +int swiotlb_init_late(size_t size, gfp_t gfp_mask, + int (*remap)(void *tlb, unsigned long nslabs)); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); -int swiotlb_init_late(size_t size, gfp_t gfp_mask); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, -- cgit v1.2.3 From 6424e31b1c050a25aea033206d5f626f3523448c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 07:41:04 +0100 Subject: swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl No users left. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- include/linux/swiotlb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 7b50c82f84ce..7ed35dd3de6e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -34,13 +34,11 @@ struct scatterlist; /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) -int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, unsigned int flags); unsigned long swiotlb_size_or_default(void); void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)); int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)); -extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, -- cgit v1.2.3 From f47a6113f4e87db7ca066635822e1b3ca3ed9514 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Wed, 16 Feb 2022 16:03:03 +0800 Subject: platform/chrome: cros_ec: remove unused variable `was_wake_device` Reviewed-by: Prashant Malani Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index df3c78c92ca2..c65971ec90ea 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -76,8 +76,6 @@ struct cros_ec_command { * struct cros_ec_device - Information about a ChromeOS EC device. * @phys_name: Name of physical comms layer (e.g. 'i2c-4'). * @dev: Device pointer for physical comms device - * @was_wake_device: True if this device was set to wake the system from - * sleep at the last suspend. * @cros_class: The class structure for this device. * @cmd_readmem: Direct read of the EC memory-mapped region, if supported. * @offset: Is within EC_LPC_ADDR_MEMMAP region. @@ -137,7 +135,6 @@ struct cros_ec_device { /* These are used by other drivers that want to talk to the EC */ const char *phys_name; struct device *dev; - bool was_wake_device; struct class *cros_class; int (*cmd_readmem)(struct cros_ec_device *ec, unsigned int offset, unsigned int bytes, void *dest); -- cgit v1.2.3 From 5f0614a55ecebdf55f1a17db0b5f6b787ed009f1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 Apr 2022 22:27:13 -0400 Subject: block: change exported IO accounting interface from gendisk to bdev Export IO accounting interfaces in terms of block_device now that gendisk has become more internal to block core. Rename __part_{start,end}_io_acct's first argument from part to bdev. Rename __part_{start,end}_io_acct to bdev_{start,end}_io_acct and export them. Remove disk_{start,end}_io_acct and update caller (zram) to use bdev_{start,end}_io_acct. DM can now be updated to use bdev_{start,end}_io_acct. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer Link: https://lore.kernel.org/r/20220418022733.56168-2-snitzer@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c9b5925af5a3..34724b15813b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1463,9 +1463,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, +unsigned long bdev_start_io_acct(struct block_device *bdev, + unsigned int sectors, unsigned int op, + unsigned long start_time); +void bdev_end_io_acct(struct block_device *bdev, unsigned int op, unsigned long start_time); void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); -- cgit v1.2.3 From dbdc1be32591af023db2812706f01e6cd2f42bfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:06 +0200 Subject: block: add a disk_openers helper Add a helper that returns the openers for a given gendisk to avoid having drivers poke into disk->part0 to get at this information in a somewhat cumbersome way. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c9b5925af5a3..436645bde13f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -176,6 +176,21 @@ static inline bool disk_live(struct gendisk *disk) return !inode_unhashed(disk->part0->bd_inode); } +/** + * disk_openers - returns how many openers are there for a disk + * @disk: disk to check + * + * This returns the number of openers for a disk. Note that this value is only + * stable if disk->open_mutex is held. + * + * Note: Due to a quirk in the block layer open code, each open partition is + * only counted once even if there are multiple openers. + */ +static inline unsigned int disk_openers(struct gendisk *disk) +{ + return disk->part0->bd_openers; +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. -- cgit v1.2.3 From 9acf381f3e8f715175c29f4b6d722f6b6797d139 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Mar 2022 07:29:07 +0200 Subject: block: turn bdev->bd_openers into an atomic_t All manipulation of bd_openers is under disk->open_mutex and will remain so for the foreseeable future. But at least one place reads it without the lock (blkdev_get) and there are more to be added. So make sure the compiler does not do turn the increments and decrements into non-atomic sequences by using an atomic_t. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220330052917.2566582-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4968cb17b13b..c62274466e72 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -44,7 +44,7 @@ struct block_device { unsigned long bd_stamp; bool bd_read_only; /* read-only policy */ dev_t bd_dev; - int bd_openers; + atomic_t bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; void * bd_claiming; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 436645bde13f..afad3d1d0dac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -188,7 +188,7 @@ static inline bool disk_live(struct gendisk *disk) */ static inline unsigned int disk_openers(struct gendisk *disk) { - return disk->part0->bd_openers; + return atomic_read(&disk->part0->bd_openers); } /* -- cgit v1.2.3 From 57b888ca2541785de2fcb90575b378921919b6c0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 18 Mar 2022 09:54:22 -0700 Subject: platform/chrome: Re-introduce cros_ec_cmd_xfer and use it for ioctls Commit 413dda8f2c6f ("platform/chrome: cros_ec_chardev: Use cros_ec_cmd_xfer_status helper") inadvertendly changed the userspace ABI. Previously, cros_ec ioctls would only report errors if the EC communication failed, and otherwise return success and the result of the EC communication. An EC command execution failure was reported in the EC response field. The above mentioned commit changed this behavior, and the ioctl itself would fail. This breaks userspace commands trying to analyze the EC command execution error since the actual EC command response is no longer reported to userspace. Fix the problem by re-introducing the cros_ec_cmd_xfer() helper, and use it to handle ioctl messages. Fixes: 413dda8f2c6f ("platform/chrome: cros_ec_chardev: Use cros_ec_cmd_xfer_status helper") Cc: Daisuke Nojiri Cc: Rob Barnes Cc: Rajat Jain Cc: Brian Norris Cc: Parth Malkan Reviewed-by: Daisuke Nojiri Reviewed-by: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index c65971ec90ea..138fd912c808 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -213,6 +213,9 @@ int cros_ec_prepare_tx(struct cros_ec_device *ec_dev, int cros_ec_check_result(struct cros_ec_device *ec_dev, struct cros_ec_command *msg); +int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, + struct cros_ec_command *msg); + int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, struct cros_ec_command *msg); -- cgit v1.2.3 From 2f1e85b1aee459b7d0fd981839042c6a38ffaf0c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:45 +0800 Subject: net: sched: use queue_mapping to pick tx queue This patch fixes issue: * If we install tc filters with act_skbedit in clsact hook. It doesn't work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, ndo_select_queue of netdev driver, or skb hash in netdev_core_pick_tx(). In fact, the netdev driver, and skb hash are _not_ under control. xps uses the CPUs map to select Tx queues, but we can't figure out which task_struct of pod/containter running on this cpu in most case. We can use clsact filters to classify one pod/container traffic to one Tx queue. Why ? In containter networking environment, there are two kinds of pod/ containter/net-namespace. One kind (e.g. P1, P2), the high throughput is key in these applications. But avoid running out of network resource, the outbound traffic of these pods is limited, using or sharing one dedicated Tx queues assigned HTB/TBF/FQ Qdisc. Other kind of pods (e.g. Pn), the low latency of data access is key. And the traffic is not limited. Pods use or share other dedicated Tx queues assigned FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to support classifying traffic based on pods/container to different Tx queues. Note that clsact is outside of Qdisc while Qdisc can run a classifier to select a sub-queue under the lock. In general recording the decision in the skb seems a little heavy handed. This patch introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is firstly cleared in __dev_queue_xmit(). - Tx Qdisc may install that skbedit actions, then xmit.skip_txqueue flag is set in qdisc->enqueue() though tx queue has been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). That flag is cleared firstly in __dev_queue_xmit(), is useful: - Avoid picking Tx queue with netdev_tx_queue_mapping() in next netdev in such case: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy: For example, eth0, macvlan in pod, which root Qdisc install skbedit queue_mapping, send packets to eth0.3, vlan in host. In __dev_queue_xmit() of eth0.3, clear the flag, does not select tx queue according to skb->queue_mapping because there is no filters in clsact or tx Qdisc of this netdev. Same action taked in eth0, ixgbe in Host. - Avoid picking Tx queue for next packet. If we set xmit.skip_txqueue in tx Qdisc (qdisc->enqueue()), the proper way to clear it is clearing it in __dev_queue_xmit when processing next packets. For performance reasons, use the static key. If user does not config the NET_EGRESS, the patch will not be compiled. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ include/linux/rtnetlink.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7f970b16da3a..ae2c6a3cec5d 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -100,6 +100,7 @@ void net_dec_ingress_queue(void); #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); +void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); -- cgit v1.2.3 From c6547c2ed0e1487c91983faccad841611ab6a783 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 14 Apr 2022 18:22:37 +0200 Subject: dmaengine: imx: Move header to include/dma/ The i.MX DMA drivers are device tree only, nothing in include/linux/platform_data/dma-imx.h has platform_data in it, so move the file to include/linux/dma/imx-dma.h. Signed-off-by: Sascha Hauer Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220414162249.3934543-10-s.hauer@pengutronix.de Signed-off-by: Mark Brown --- include/linux/dma/imx-dma.h | 68 +++++++++++++++++++++++++++++++++++ include/linux/platform_data/dma-imx.h | 68 ----------------------------------- 2 files changed, 68 insertions(+), 68 deletions(-) create mode 100644 include/linux/dma/imx-dma.h delete mode 100644 include/linux/platform_data/dma-imx.h (limited to 'include/linux') diff --git a/include/linux/dma/imx-dma.h b/include/linux/dma/imx-dma.h new file mode 100644 index 000000000000..b06cba85a6d4 --- /dev/null +++ b/include/linux/dma/imx-dma.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2004-2009 Freescale Semiconductor, Inc. All Rights Reserved. + */ + +#ifndef __LINUX_DMA_IMX_H +#define __LINUX_DMA_IMX_H + +#include +#include +#include + +/* + * This enumerates peripheral types. Used for SDMA. + */ +enum sdma_peripheral_type { + IMX_DMATYPE_SSI, /* MCU domain SSI */ + IMX_DMATYPE_SSI_SP, /* Shared SSI */ + IMX_DMATYPE_MMC, /* MMC */ + IMX_DMATYPE_SDHC, /* SDHC */ + IMX_DMATYPE_UART, /* MCU domain UART */ + IMX_DMATYPE_UART_SP, /* Shared UART */ + IMX_DMATYPE_FIRI, /* FIRI */ + IMX_DMATYPE_CSPI, /* MCU domain CSPI */ + IMX_DMATYPE_CSPI_SP, /* Shared CSPI */ + IMX_DMATYPE_SIM, /* SIM */ + IMX_DMATYPE_ATA, /* ATA */ + IMX_DMATYPE_CCM, /* CCM */ + IMX_DMATYPE_EXT, /* External peripheral */ + IMX_DMATYPE_MSHC, /* Memory Stick Host Controller */ + IMX_DMATYPE_MSHC_SP, /* Shared Memory Stick Host Controller */ + IMX_DMATYPE_DSP, /* DSP */ + IMX_DMATYPE_MEMORY, /* Memory */ + IMX_DMATYPE_FIFO_MEMORY,/* FIFO type Memory */ + IMX_DMATYPE_SPDIF, /* SPDIF */ + IMX_DMATYPE_IPU_MEMORY, /* IPU Memory */ + IMX_DMATYPE_ASRC, /* ASRC */ + IMX_DMATYPE_ESAI, /* ESAI */ + IMX_DMATYPE_SSI_DUAL, /* SSI Dual FIFO */ + IMX_DMATYPE_ASRC_SP, /* Shared ASRC */ + IMX_DMATYPE_SAI, /* SAI */ +}; + +enum imx_dma_prio { + DMA_PRIO_HIGH = 0, + DMA_PRIO_MEDIUM = 1, + DMA_PRIO_LOW = 2 +}; + +struct imx_dma_data { + int dma_request; /* DMA request line */ + int dma_request2; /* secondary DMA request line */ + enum sdma_peripheral_type peripheral_type; + int priority; +}; + +static inline int imx_dma_is_ipu(struct dma_chan *chan) +{ + return !strcmp(dev_name(chan->device->dev), "ipu-core"); +} + +static inline int imx_dma_is_general_purpose(struct dma_chan *chan) +{ + return !strcmp(chan->device->dev->driver->name, "imx-sdma") || + !strcmp(chan->device->dev->driver->name, "imx-dma"); +} + +#endif /* __LINUX_DMA_IMX_H */ diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h deleted file mode 100644 index 281adbb26e6b..000000000000 --- a/include/linux/platform_data/dma-imx.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2004-2009 Freescale Semiconductor, Inc. All Rights Reserved. - */ - -#ifndef __ASM_ARCH_MXC_DMA_H__ -#define __ASM_ARCH_MXC_DMA_H__ - -#include -#include -#include - -/* - * This enumerates peripheral types. Used for SDMA. - */ -enum sdma_peripheral_type { - IMX_DMATYPE_SSI, /* MCU domain SSI */ - IMX_DMATYPE_SSI_SP, /* Shared SSI */ - IMX_DMATYPE_MMC, /* MMC */ - IMX_DMATYPE_SDHC, /* SDHC */ - IMX_DMATYPE_UART, /* MCU domain UART */ - IMX_DMATYPE_UART_SP, /* Shared UART */ - IMX_DMATYPE_FIRI, /* FIRI */ - IMX_DMATYPE_CSPI, /* MCU domain CSPI */ - IMX_DMATYPE_CSPI_SP, /* Shared CSPI */ - IMX_DMATYPE_SIM, /* SIM */ - IMX_DMATYPE_ATA, /* ATA */ - IMX_DMATYPE_CCM, /* CCM */ - IMX_DMATYPE_EXT, /* External peripheral */ - IMX_DMATYPE_MSHC, /* Memory Stick Host Controller */ - IMX_DMATYPE_MSHC_SP, /* Shared Memory Stick Host Controller */ - IMX_DMATYPE_DSP, /* DSP */ - IMX_DMATYPE_MEMORY, /* Memory */ - IMX_DMATYPE_FIFO_MEMORY,/* FIFO type Memory */ - IMX_DMATYPE_SPDIF, /* SPDIF */ - IMX_DMATYPE_IPU_MEMORY, /* IPU Memory */ - IMX_DMATYPE_ASRC, /* ASRC */ - IMX_DMATYPE_ESAI, /* ESAI */ - IMX_DMATYPE_SSI_DUAL, /* SSI Dual FIFO */ - IMX_DMATYPE_ASRC_SP, /* Shared ASRC */ - IMX_DMATYPE_SAI, /* SAI */ -}; - -enum imx_dma_prio { - DMA_PRIO_HIGH = 0, - DMA_PRIO_MEDIUM = 1, - DMA_PRIO_LOW = 2 -}; - -struct imx_dma_data { - int dma_request; /* DMA request line */ - int dma_request2; /* secondary DMA request line */ - enum sdma_peripheral_type peripheral_type; - int priority; -}; - -static inline int imx_dma_is_ipu(struct dma_chan *chan) -{ - return !strcmp(dev_name(chan->device->dev), "ipu-core"); -} - -static inline int imx_dma_is_general_purpose(struct dma_chan *chan) -{ - return !strcmp(chan->device->dev->driver->name, "imx-sdma") || - !strcmp(chan->device->dev->driver->name, "imx-dma"); -} - -#endif -- cgit v1.2.3 From 824a0a02cd74776461aaa30d792b1ed9111c9aa5 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 14 Apr 2022 18:22:39 +0200 Subject: dmaengine: imx-sdma: Add multi fifo support The i.MX SDMA engine can read from / write to multiple successive hardware FIFO registers, referred to as "Multi FIFO support". This is needed for the micfil driver and certain configurations of the SAI driver. This patch adds support for this feature. The number of FIFOs to read from / write to must be communicated from the client driver to the SDMA engine. For this the struct dma_slave_config::peripheral_config field is used. Signed-off-by: Sascha Hauer Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20220414162249.3934543-12-s.hauer@pengutronix.de Signed-off-by: Mark Brown --- include/linux/dma/imx-dma.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma/imx-dma.h b/include/linux/dma/imx-dma.h index b06cba85a6d4..8887762360d4 100644 --- a/include/linux/dma/imx-dma.h +++ b/include/linux/dma/imx-dma.h @@ -39,6 +39,7 @@ enum sdma_peripheral_type { IMX_DMATYPE_SSI_DUAL, /* SSI Dual FIFO */ IMX_DMATYPE_ASRC_SP, /* Shared ASRC */ IMX_DMATYPE_SAI, /* SAI */ + IMX_DMATYPE_MULTI_SAI, /* MULTI FIFOs For Audio */ }; enum imx_dma_prio { @@ -65,4 +66,23 @@ static inline int imx_dma_is_general_purpose(struct dma_chan *chan) !strcmp(chan->device->dev->driver->name, "imx-dma"); } +/** + * struct sdma_peripheral_config - SDMA config for audio + * @n_fifos_src: Number of FIFOs for recording + * @n_fifos_dst: Number of FIFOs for playback + * @sw_done: Use software done. Needed for PDM (micfil) + * + * Some i.MX Audio devices (SAI, micfil) have multiple successive FIFO + * registers. For multichannel recording/playback the SAI/micfil have + * one FIFO register per channel and the SDMA engine has to read/write + * the next channel from/to the next register and wrap around to the + * first register when all channels are handled. The number of active + * channels must be communicated to the SDMA engine using this struct. + */ +struct sdma_peripheral_config { + int n_fifos_src; + int n_fifos_dst; + bool sw_done; +}; + #endif /* __LINUX_DMA_IMX_H */ -- cgit v1.2.3 From 6c846d026d490b2383d395bc8e7b06336219667b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 19 Apr 2022 15:18:37 +0100 Subject: gpio: Don't fiddle with irqchips marked as immutable In order to move away from gpiolib messing with the internals of unsuspecting irqchips, add a flag by which irqchips advertise that they are not to be messed with, and do solemnly swear that they correctly call into the gpiolib helpers when required. Also nudge the users into converting their drivers to the new model. Reviewed-by: Andy Shevchenko Reviewed-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220419141846.598305-2-maz@kernel.org --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index f92788ccdba2..505308253d23 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -569,6 +569,7 @@ struct irq_chip { * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup + * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -582,6 +583,7 @@ enum { IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), + IRQCHIP_IMMUTABLE = (1 << 11), }; #include -- cgit v1.2.3 From 704f08753b6dcd0e08c1953af0b2c7f3fac87111 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 19 Apr 2022 15:18:38 +0100 Subject: gpio: Expose the gpiochip_irq_re[ql]res helpers The GPIO subsystem has a couple of internal helpers to manage resources on behalf of the irqchip. Expose them so that GPIO drivers can use them directly. Reviewed-by: Andy Shevchenko Reviewed-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220419141846.598305-3-maz@kernel.org --- include/linux/gpio/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 98c93510640e..066bcfdf878d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -579,6 +579,10 @@ void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset); void gpiochip_disable_irq(struct gpio_chip *gc, unsigned int offset); void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset); +/* irq_data versions of the above */ +int gpiochip_irq_reqres(struct irq_data *data); +void gpiochip_irq_relres(struct irq_data *data); + /* Line status inquiry for drivers */ bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset); bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset); -- cgit v1.2.3 From 36b78aae4bfee749bbde73be570796bfd0f56bec Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 19 Apr 2022 15:18:39 +0100 Subject: gpio: Add helpers to ease the transition towards immutable irq_chip Add a couple of new helpers to make it slightly simpler to convert drivers to immutable irq_chip structures: - GPIOCHIP_IRQ_RESOURCE_HELPERS populates the irq_chip structure with the resource management callbacks - gpio_irq_chip_set_chip() populates the gpio_irq_chip.chip structure, avoiding the proliferation of ugly casts Reviewed-by: Andy Shevchenko Reviewed-by: Bartosz Golaszewski Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220419141846.598305-4-maz@kernel.org --- include/linux/gpio/driver.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 066bcfdf878d..832990099097 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -583,6 +583,18 @@ void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset); int gpiochip_irq_reqres(struct irq_data *data); void gpiochip_irq_relres(struct irq_data *data); +/* Paste this in your irq_chip structure */ +#define GPIOCHIP_IRQ_RESOURCE_HELPERS \ + .irq_request_resources = gpiochip_irq_reqres, \ + .irq_release_resources = gpiochip_irq_relres + +static inline void gpio_irq_chip_set_chip(struct gpio_irq_chip *girq, + const struct irq_chip *chip) +{ + /* Yes, dropping const is ugly, but it isn't like we have a choice */ + girq->chip = (struct irq_chip *)chip; +} + /* Line status inquiry for drivers */ bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset); bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset); -- cgit v1.2.3 From 08d3df8c81537089fc8f21006b56f2f6fb23c6f8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 1 Sep 2019 22:26:10 +0200 Subject: ARM: pxa: split up mach/hardware.h The mach/hardware.h is included in lots of places, and it provides three different things on pxa: - the cpu_is_pxa* macros - an indirect inclusion of mach/addr-map.h - the __REG() and io_pv2() helper macros Split it up into separate and mach/pxa-regs.h headers, then change all the files that use mach/hardware.h to include the exact set of those three headers that they actually need, allowing for further more targeted cleanup. linux/soc/pxa/cpu.h can remain permanently exported and is now in a global location along with similar headers. pxa-regs.h and addr-map.h are only used in a very small number of drivers now and can be moved to arch/arm/mach-pxa/ directly when those drivers are to pass the necessary data as resources. Cc: Michael Turquette Cc: Stephen Boyd Acked-by: Viresh Kumar Acked-by: Dmitry Torokhov Cc: Jacek Anaszewski Cc: Pavel Machek Acked-by: Ulf Hansson Cc: Dominik Brodowski Acked-by: Alexandre Belloni Cc: Greg Kroah-Hartman Cc: Guenter Roeck Acked-by: Mark Brown Cc: linux-clk@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: linux-input@vger.kernel.org Cc: linux-leds@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-mtd@lists.infradead.org Cc: linux-rtc@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Cc: linux-watchdog@vger.kernel.org Cc: alsa-devel@alsa-project.org Signed-off-by: Arnd Bergmann --- include/linux/soc/pxa/cpu.h | 252 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 include/linux/soc/pxa/cpu.h (limited to 'include/linux') diff --git a/include/linux/soc/pxa/cpu.h b/include/linux/soc/pxa/cpu.h new file mode 100644 index 000000000000..5782450ee45c --- /dev/null +++ b/include/linux/soc/pxa/cpu.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Author: Nicolas Pitre + * Created: Jun 15, 2001 + * Copyright: MontaVista Software Inc. + */ + +#ifndef __SOC_PXA_CPU_H +#define __SOC_PXA_CPU_H + +#ifdef CONFIG_ARM +#include +#endif + +/* + * CPU Stepping CPU_ID JTAG_ID + * + * PXA210 B0 0x69052922 0x2926C013 + * PXA210 B1 0x69052923 0x3926C013 + * PXA210 B2 0x69052924 0x4926C013 + * PXA210 C0 0x69052D25 0x5926C013 + * + * PXA250 A0 0x69052100 0x09264013 + * PXA250 A1 0x69052101 0x19264013 + * PXA250 B0 0x69052902 0x29264013 + * PXA250 B1 0x69052903 0x39264013 + * PXA250 B2 0x69052904 0x49264013 + * PXA250 C0 0x69052D05 0x59264013 + * + * PXA255 A0 0x69052D06 0x69264013 + * + * PXA26x A0 0x69052903 0x39264013 + * PXA26x B0 0x69052D05 0x59264013 + * + * PXA27x A0 0x69054110 0x09265013 + * PXA27x A1 0x69054111 0x19265013 + * PXA27x B0 0x69054112 0x29265013 + * PXA27x B1 0x69054113 0x39265013 + * PXA27x C0 0x69054114 0x49265013 + * PXA27x C5 0x69054117 0x79265013 + * + * PXA30x A0 0x69056880 0x0E648013 + * PXA30x A1 0x69056881 0x1E648013 + * PXA31x A0 0x69056890 0x0E649013 + * PXA31x A1 0x69056891 0x1E649013 + * PXA31x A2 0x69056892 0x2E649013 + * PXA32x B1 0x69056825 0x5E642013 + * PXA32x B2 0x69056826 0x6E642013 + * + * PXA930 B0 0x69056835 0x5E643013 + * PXA930 B1 0x69056837 0x7E643013 + * PXA930 B2 0x69056838 0x8E643013 + * + * PXA935 A0 0x56056931 0x1E653013 + * PXA935 B0 0x56056936 0x6E653013 + * PXA935 B1 0x56056938 0x8E653013 + */ +#ifdef CONFIG_PXA25x +#define __cpu_is_pxa210(id) \ + ({ \ + unsigned int _id = (id) & 0xf3f0; \ + _id == 0x2120; \ + }) + +#define __cpu_is_pxa250(id) \ + ({ \ + unsigned int _id = (id) & 0xf3ff; \ + _id <= 0x2105; \ + }) + +#define __cpu_is_pxa255(id) \ + ({ \ + unsigned int _id = (id) & 0xffff; \ + _id == 0x2d06; \ + }) + +#define __cpu_is_pxa25x(id) \ + ({ \ + unsigned int _id = (id) & 0xf300; \ + _id == 0x2100; \ + }) +#else +#define __cpu_is_pxa210(id) (0) +#define __cpu_is_pxa250(id) (0) +#define __cpu_is_pxa255(id) (0) +#define __cpu_is_pxa25x(id) (0) +#endif + +#ifdef CONFIG_PXA27x +#define __cpu_is_pxa27x(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x411; \ + }) +#else +#define __cpu_is_pxa27x(id) (0) +#endif + +#ifdef CONFIG_CPU_PXA300 +#define __cpu_is_pxa300(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x688; \ + }) +#else +#define __cpu_is_pxa300(id) (0) +#endif + +#ifdef CONFIG_CPU_PXA310 +#define __cpu_is_pxa310(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x689; \ + }) +#else +#define __cpu_is_pxa310(id) (0) +#endif + +#ifdef CONFIG_CPU_PXA320 +#define __cpu_is_pxa320(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x603 || _id == 0x682; \ + }) +#else +#define __cpu_is_pxa320(id) (0) +#endif + +#ifdef CONFIG_CPU_PXA930 +#define __cpu_is_pxa930(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x683; \ + }) +#else +#define __cpu_is_pxa930(id) (0) +#endif + +#ifdef CONFIG_CPU_PXA935 +#define __cpu_is_pxa935(id) \ + ({ \ + unsigned int _id = (id) >> 4 & 0xfff; \ + _id == 0x693; \ + }) +#else +#define __cpu_is_pxa935(id) (0) +#endif + +#define cpu_is_pxa210() \ + ({ \ + __cpu_is_pxa210(read_cpuid_id()); \ + }) + +#define cpu_is_pxa250() \ + ({ \ + __cpu_is_pxa250(read_cpuid_id()); \ + }) + +#define cpu_is_pxa255() \ + ({ \ + __cpu_is_pxa255(read_cpuid_id()); \ + }) + +#define cpu_is_pxa25x() \ + ({ \ + __cpu_is_pxa25x(read_cpuid_id()); \ + }) + +#define cpu_is_pxa27x() \ + ({ \ + __cpu_is_pxa27x(read_cpuid_id()); \ + }) + +#define cpu_is_pxa300() \ + ({ \ + __cpu_is_pxa300(read_cpuid_id()); \ + }) + +#define cpu_is_pxa310() \ + ({ \ + __cpu_is_pxa310(read_cpuid_id()); \ + }) + +#define cpu_is_pxa320() \ + ({ \ + __cpu_is_pxa320(read_cpuid_id()); \ + }) + +#define cpu_is_pxa930() \ + ({ \ + __cpu_is_pxa930(read_cpuid_id()); \ + }) + +#define cpu_is_pxa935() \ + ({ \ + __cpu_is_pxa935(read_cpuid_id()); \ + }) + + + +/* + * CPUID Core Generation Bit + * <= 0x2 for pxa21x/pxa25x/pxa26x/pxa27x + */ +#if defined(CONFIG_PXA25x) || defined(CONFIG_PXA27x) +#define __cpu_is_pxa2xx(id) \ + ({ \ + unsigned int _id = (id) >> 13 & 0x7; \ + _id <= 0x2; \ + }) +#else +#define __cpu_is_pxa2xx(id) (0) +#endif + +#ifdef CONFIG_PXA3xx +#define __cpu_is_pxa3xx(id) \ + ({ \ + __cpu_is_pxa300(id) \ + || __cpu_is_pxa310(id) \ + || __cpu_is_pxa320(id) \ + || __cpu_is_pxa93x(id); \ + }) +#else +#define __cpu_is_pxa3xx(id) (0) +#endif + +#if defined(CONFIG_CPU_PXA930) || defined(CONFIG_CPU_PXA935) +#define __cpu_is_pxa93x(id) \ + ({ \ + __cpu_is_pxa930(id) \ + || __cpu_is_pxa935(id); \ + }) +#else +#define __cpu_is_pxa93x(id) (0) +#endif + +#define cpu_is_pxa2xx() \ + ({ \ + __cpu_is_pxa2xx(read_cpuid_id()); \ + }) + +#define cpu_is_pxa3xx() \ + ({ \ + __cpu_is_pxa3xx(read_cpuid_id()); \ + }) + +#define cpu_is_pxa93x() \ + ({ \ + __cpu_is_pxa93x(read_cpuid_id()); \ + }) + +#endif -- cgit v1.2.3 From 22f0866513c2e531ae65a9d5dfc82f24497ef3b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Sep 2019 00:02:08 +0200 Subject: ARM: pxa: move mach/sound.h to linux/platform_data/ This is a basically a platform_data file, so move it out of the mach/* header directory. Cc: Marek Vasut Cc: Tomas Cech Cc: Sergey Lapin Acked-by: Mark Brown Acked-by: Robert Jarzmik Cc: alsa-devel@alsa-project.org Signed-off-by: Arnd Bergmann --- include/linux/platform_data/asoc-pxa.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/platform_data/asoc-pxa.h (limited to 'include/linux') diff --git a/include/linux/platform_data/asoc-pxa.h b/include/linux/platform_data/asoc-pxa.h new file mode 100644 index 000000000000..327454cd8246 --- /dev/null +++ b/include/linux/platform_data/asoc-pxa.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SOC_PXA_AUDIO_H__ +#define __SOC_PXA_AUDIO_H__ + +#include +#include +#include + +/* + * @reset_gpio: AC97 reset gpio (normally gpio113 or gpio95) + * a -1 value means no gpio will be used for reset + * @codec_pdata: AC97 codec platform_data + + * reset_gpio should only be specified for pxa27x CPUs where a silicon + * bug prevents correct operation of the reset line. If not specified, + * the default behaviour on these CPUs is to consider gpio 113 as the + * AC97 reset line, which is the default on most boards. + */ +typedef struct { + int (*startup)(struct snd_pcm_substream *, void *); + void (*shutdown)(struct snd_pcm_substream *, void *); + void (*suspend)(void *); + void (*resume)(void *); + void *priv; + int reset_gpio; + void *codec_pdata[AC97_BUS_MAX_DEVICES]; +} pxa2xx_audio_ops_t; + +extern void pxa_set_ac97_info(pxa2xx_audio_ops_t *ops); + +#endif -- cgit v1.2.3 From ee84cbd5df2beaf14e8af0955f1ab15ad3f81504 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Sep 2019 00:15:44 +0200 Subject: ARM: pxa: move regs-lcd.h into driver Only the pxafb driver uses this header, so move it into the same directory. The SMART_* macros are required by some platform data definitions and can go into the linux/platform_data/video-pxafb.h header. Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Robert Jarzmik Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Signed-off-by: Arnd Bergmann --- include/linux/platform_data/video-pxafb.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/video-pxafb.h b/include/linux/platform_data/video-pxafb.h index b3d574778326..6333bac166a5 100644 --- a/include/linux/platform_data/video-pxafb.h +++ b/include/linux/platform_data/video-pxafb.h @@ -8,7 +8,6 @@ */ #include -#include /* * Supported LCD connections @@ -153,6 +152,27 @@ struct pxafb_mach_info { void pxa_set_fb_info(struct device *, struct pxafb_mach_info *); unsigned long pxafb_get_hsync_time(struct device *dev); +/* smartpanel related */ +#define SMART_CMD_A0 (0x1 << 8) +#define SMART_CMD_READ_STATUS_REG (0x0 << 9) +#define SMART_CMD_READ_FRAME_BUFFER ((0x0 << 9) | SMART_CMD_A0) +#define SMART_CMD_WRITE_COMMAND (0x1 << 9) +#define SMART_CMD_WRITE_DATA ((0x1 << 9) | SMART_CMD_A0) +#define SMART_CMD_WRITE_FRAME ((0x2 << 9) | SMART_CMD_A0) +#define SMART_CMD_WAIT_FOR_VSYNC (0x3 << 9) +#define SMART_CMD_NOOP (0x4 << 9) +#define SMART_CMD_INTERRUPT (0x5 << 9) + +#define SMART_CMD(x) (SMART_CMD_WRITE_COMMAND | ((x) & 0xff)) +#define SMART_DAT(x) (SMART_CMD_WRITE_DATA | ((x) & 0xff)) + +/* SMART_DELAY() is introduced for software controlled delay primitive which + * can be inserted between command sequences, unused command 0x6 is used here + * and delay ranges from 0ms ~ 255ms + */ +#define SMART_CMD_DELAY (0x6 << 9) +#define SMART_DELAY(ms) (SMART_CMD_DELAY | ((ms) & 0xff)) + #ifdef CONFIG_FB_PXA_SMARTPANEL extern int pxafb_smart_queue(struct fb_info *info, uint16_t *cmds, int); extern int pxafb_smart_flush(struct fb_info *info); -- cgit v1.2.3 From eb38c2053b67977844404cbbdee341dbf3a02d36 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 14 Mar 2022 23:09:10 +0100 Subject: can: rx-offload: rename can_rx_offload_queue_sorted() -> can_rx_offload_queue_timestamp() This patch renames the function can_rx_offload_queue_sorted() to can_rx_offload_queue_timestamp(). This better describes what the function does, it adds a newly RX'ed skb to the sorted queue by its timestamp. Link: https://lore.kernel.org/all/20220417194327.2699059-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index c11477620403..c205c51d79c9 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -42,8 +42,8 @@ int can_rx_offload_add_manual(struct net_device *dev, int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); -int can_rx_offload_queue_sorted(struct can_rx_offload *offload, - struct sk_buff *skb, u32 timestamp); +int can_rx_offload_queue_timestamp(struct can_rx_offload *offload, + struct sk_buff *skb, u32 timestamp); unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int idx, u32 timestamp, unsigned int *frame_len_ptr); -- cgit v1.2.3 From 055eb95533273bc334794dbc598400d10800528f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 14 Apr 2022 09:12:33 -0700 Subject: bpf: Move rcu lock management out of BPF_PROG_RUN routines Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") switched a bunch of BPF_PROG_RUN macros to inline routines. This changed the semantic a bit. Due to arguments expansion of macros, it used to be: rcu_read_lock(); array = rcu_dereference(cgrp->bpf.effective[atype]); ... Now, with with inline routines, we have: array_rcu = rcu_dereference(cgrp->bpf.effective[atype]); /* array_rcu can be kfree'd here */ rcu_read_lock(); array = rcu_dereference(array_rcu); I'm assuming in practice rcu subsystem isn't fast enough to trigger this but let's use rcu API properly. Also, rename to lower caps to not confuse with macros. Additionally, drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY. See [1] for more context. [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u v2 - keep rcu locks inside by passing cgroup_bpf Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com --- include/linux/bpf.h | 115 ++++------------------------------------------------ 1 file changed, 7 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb5298735ce..7bf441563ffc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); + * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. @@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline int -BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval, u32 *ret_flags) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - u32 func_ret; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - func_ret = run_prog(prog, ctx); - if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - *(ret_flags) |= (func_ret >> 1); - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - -static __always_inline int -BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog, - int retval) -{ - const struct bpf_prog_array_item *item; - const struct bpf_prog *prog; - const struct bpf_prog_array *array; - struct bpf_run_ctx *old_run_ctx; - struct bpf_cg_run_ctx run_ctx; - - run_ctx.retval = retval; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); - item = &array->items[0]; - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - while ((prog = READ_ONCE(item->prog))) { - run_ctx.prog_item = item; - if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) - run_ctx.retval = -EPERM; - item++; - } - bpf_reset_run_ctx(old_run_ctx); - rcu_read_unlock(); - migrate_enable(); - return run_ctx.retval; -} - static __always_inline u32 -BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; - migrate_disable(); - rcu_read_lock(); - array = rcu_dereference(array_rcu); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); + if (unlikely(!array)) - goto out; + return ret; + + migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, item++; } bpf_reset_run_ctx(old_run_ctx); -out: - rcu_read_unlock(); migrate_enable(); return ret; } -/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs - * so BPF programs can request cwr for TCP packets. - * - * Current cgroup skb programs can only return 0 or 1 (0 to drop the - * packet. This macro changes the behavior so the low order bit - * indicates whether the packet should be dropped (0) or not (1) - * and the next bit is a congestion notification bit. This could be - * used by TCP to call tcp_enter_cwr() - * - * Hence, new allowed return values of CGROUP EGRESS BPF programs are: - * 0: drop packet - * 1: keep packet - * 2: drop packet and cn - * 3: keep packet and cn - * - * This macro then converts it to one of the NET_XMIT or an error - * code that is then interpreted as drop packet (and no cn): - * 0: NET_XMIT_SUCCESS skb should be transmitted - * 1: NET_XMIT_DROP skb should be dropped and cn - * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -err skb should be dropped - */ -#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ - ({ \ - u32 _flags = 0; \ - bool _cn; \ - u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ - _cn = _flags & BPF_RET_SET_CN; \ - if (_ret && !IS_ERR_VALUE((long)_ret)) \ - _ret = -EFAULT; \ - if (!_ret) \ - _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ - else \ - _ret = (_cn ? NET_XMIT_DROP : _ret); \ - _ret; \ - }) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; -- cgit v1.2.3 From b83deaa741558babf4b8d51d34f6637ccfff1b26 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 May 2020 22:57:40 +0200 Subject: ARM: pxa: move pcmcia board data into mach-pxa The drivers/pcmcia/pxa2xx_*.c are essentially part of the board files, but for historic reasons located in drivers/pcmcia. Move them into the same place as the actual board file to avoid lots of machine header inclusions. Cc: Marek Vasut Cc: Dominik Brodowski Cc: Jonathan Cameron Signed-off-by: Arnd Bergmann --- include/linux/platform_data/pcmcia-pxa2xx_viper.h | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 include/linux/platform_data/pcmcia-pxa2xx_viper.h (limited to 'include/linux') diff --git a/include/linux/platform_data/pcmcia-pxa2xx_viper.h b/include/linux/platform_data/pcmcia-pxa2xx_viper.h deleted file mode 100644 index a23b58aff9e1..000000000000 --- a/include/linux/platform_data/pcmcia-pxa2xx_viper.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCOM_PCMCIA_H -#define __ARCOM_PCMCIA_H - -struct arcom_pcmcia_pdata { - int cd_gpio; - int rdy_gpio; - int pwr_gpio; - void (*reset)(int state); -}; - -#endif -- cgit v1.2.3 From dcf456c9a095a6e71f53d6f6f004133ee851ee70 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 18 Apr 2022 15:51:58 +0000 Subject: bpf: Fix usage of trace RCU in local storage. bpf_{sk,task,inode}_storage_free() do not need to use call_rcu_tasks_trace as no BPF program should be accessing the owner as it's being destroyed. The only other reader at this point is bpf_local_storage_map_free() which uses normal RCU. The only path that needs trace RCU are: * bpf_local_storage_{delete,update} helpers * map_{delete,update}_elem() syscalls Fixes: 0fe4b381a59e ("bpf: Allow bpf_local_storage to be used by sleepable programs") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220418155158.2865678-1-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 493e63258497..7ea18d4da84b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,9 +143,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_omem); + bool uncharge_omem, bool use_trace_rcu); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -- cgit v1.2.3 From 3abfaefb9a6dda5e0bfa6160f55abfd0e32d8b0a Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Tue, 19 Apr 2022 09:08:49 +0800 Subject: phy: Add LVDS configuration options This patch allows LVDS PHYs to be configured through the generic functions and through a custom structure added to the generic union. The parameters added here are based on common LVDS PHY implementation practices. The set of parameters should cover all potential users. Cc: Kishon Vijay Abraham I Cc: Vinod Koul Cc: NXP Linux Team Signed-off-by: Liu Ying Link: https://lore.kernel.org/r/20220419010852.452169-3-victor.liu@nxp.com Signed-off-by: Vinod Koul --- include/linux/phy/phy-lvds.h | 32 ++++++++++++++++++++++++++++++++ include/linux/phy/phy.h | 4 ++++ 2 files changed, 36 insertions(+) create mode 100644 include/linux/phy/phy-lvds.h (limited to 'include/linux') diff --git a/include/linux/phy/phy-lvds.h b/include/linux/phy/phy-lvds.h new file mode 100644 index 000000000000..09931d080a6d --- /dev/null +++ b/include/linux/phy/phy-lvds.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2020,2022 NXP + */ + +#ifndef __PHY_LVDS_H_ +#define __PHY_LVDS_H_ + +/** + * struct phy_configure_opts_lvds - LVDS configuration set + * @bits_per_lane_and_dclk_cycle: Number of bits per lane per differential + * clock cycle. + * @differential_clk_rate: Clock rate, in Hertz, of the LVDS + * differential clock. + * @lanes: Number of active, consecutive, + * data lanes, starting from lane 0, + * used for the transmissions. + * @is_slave: Boolean, true if the phy is a slave + * which works together with a master + * phy to support dual link transmission, + * otherwise a regular phy or a master phy. + * + * This structure is used to represent the configuration state of a LVDS phy. + */ +struct phy_configure_opts_lvds { + unsigned int bits_per_lane_and_dclk_cycle; + unsigned long differential_clk_rate; + unsigned int lanes; + bool is_slave; +}; + +#endif /* __PHY_LVDS_H_ */ diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index f3286f4cd306..b1413757fcc3 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -17,6 +17,7 @@ #include #include +#include #include struct phy; @@ -57,10 +58,13 @@ enum phy_media { * the MIPI_DPHY phy mode. * @dp: Configuration set applicable for phys supporting * the DisplayPort protocol. + * @lvds: Configuration set applicable for phys supporting + * the LVDS phy mode. */ union phy_configure_opts { struct phy_configure_opts_mipi_dphy mipi_dphy; struct phy_configure_opts_dp dp; + struct phy_configure_opts_lvds lvds; }; /** -- cgit v1.2.3 From fc44ff0ae9f2aa20b64c4e63ab4614156df80240 Mon Sep 17 00:00:00 2001 From: Ben Walker Date: Tue, 1 Mar 2022 11:25:48 -0700 Subject: dmaengine: Document dmaengine_prep_dma_memset Document this function to make clear the expected behavior of the 'value' parameter. It was intended to match the behavior of POSIX memset as laid out here: https://lore.kernel.org/dmaengine/YejrA5ZWZ3lTRO%2F1@matsya/ Signed-off-by: Ben Walker Link: https://lore.kernel.org/r/20220301182551.883474-2-benjamin.walker@intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6db9e03afd0b..b46b88e6aa0d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1030,6 +1030,14 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( return chan->device->device_prep_interleaved_dma(chan, xt, flags); } +/** + * dmaengine_prep_dma_memset() - Prepare a DMA memset descriptor. + * @chan: The channel to be used for this descriptor + * @dest: Address of buffer to be set + * @value: Treated as a single byte value that fills the destination buffer + * @len: The total size of dest + * @flags: DMA engine flags + */ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags) -- cgit v1.2.3 From 5252c1c5a08e583ab1363f809002cd8a59272b35 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Sat, 16 Apr 2022 17:54:28 +0800 Subject: soc: mediatek: cmdq: Use mailbox rx_callback instead of cmdq_task_cb rx_callback is a standard mailbox callback mechanism and could cover the function of proprietary cmdq_task_cb, so use the standard one instead of the proprietary one. Client has changed to use the standard callback machanism and sync dma buffer in client driver, so remove the proprietary callback in cmdq helper. Signed-off-by: Chun-Kuang Hu Reviewed-by: jason-jh.lin Tested-by: jason-jh.lin Link: https://lore.kernel.org/r/1650102868-26219-1-git-send-email-chunkuang.hu@kernel.org Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-cmdq.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index ac6b5f3cba95..2b498f4f3946 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -268,8 +268,6 @@ int cmdq_pkt_finalize(struct cmdq_pkt *pkt); * cmdq_pkt_flush_async() - trigger CMDQ to asynchronously execute the CMDQ * packet and call back at the end of done packet * @pkt: the CMDQ packet - * @cb: called at the end of done packet - * @data: this data will pass back to cb * * Return: 0 for success; else the error code is returned * @@ -277,7 +275,6 @@ int cmdq_pkt_finalize(struct cmdq_pkt *pkt); * at the end of done packet. Note that this is an ASYNC function. When the * function returned, it may or may not be finished. */ -int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb, - void *data); +int cmdq_pkt_flush_async(struct cmdq_pkt *pkt); #endif /* __MTK_CMDQ_H__ */ -- cgit v1.2.3 From 7a107b2c6b813c3c587d1e76cb405dc637dd2b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 20 Apr 2022 11:19:01 +0300 Subject: Revert "serial: 8250: Handle UART without interrupt on TEMT using em485" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This partially reverts commit f6f586102add. The code added by that commit containted math overflow for 32-bit archs. In addition, the approach used in it is unnecessarily complicated requiring a dedicated timer just for notemt. A simpler approach for providing UART_CAP_NOTEMT already exists (patches 1-2): https://lore.kernel.org/linux-serial/20220411083321.9131-3-ilpo.jarvinen@linux.intel.com/T/#u Thus, simply revert the UART_CAP_NOTEMT change for now. There were two driver changes within the patch series adding UART_CAP_NOTEMT taking advantage of the newly added flag. This does not revert the driver changes and therefore also UART_CAP_NOTEMT define has to remain. UART_CAP_NOTEMT remains no-op until support is again added. Fixes: f6f586102add ("serial: 8250: Handle UART without interrupt on TEMT using em485") Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/5f874142-fb1f-bff7-f33-fac823e65e2e@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index de135852107c..ff84a3ed10ea 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -79,9 +79,7 @@ struct uart_8250_ops { struct uart_8250_em485 { struct hrtimer start_tx_timer; /* "rs485 start tx" timer */ struct hrtimer stop_tx_timer; /* "rs485 stop tx" timer */ - struct hrtimer no_temt_timer; /* "rs485 no TEMT interrupt" timer */ struct hrtimer *active_timer; /* pointer to active timer */ - unsigned long no_temt_delay; /* Delay for no_temt_timer */ struct uart_8250_port *port; /* for hrtimer callbacks */ unsigned int tx_stopped:1; /* tx is currently stopped */ }; -- cgit v1.2.3 From 92ece28072f18f30099770c5d4b8e300ea6820fa Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Sat, 16 Apr 2022 18:58:00 +0800 Subject: net: Change skb_ensure_writable()'s write_len param to unsigned int type Both pskb_may_pull() and skb_clone_writable()'s length parameters are of type unsigned int already. Therefore, change this function's write_len param to unsigned int type. Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220416105801.88708-3-liujian56@huawei.com --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2394441fa3dd..b37c829d08e1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3885,7 +3885,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); -int skb_ensure_writable(struct sk_buff *skb, int write_len); +int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -- cgit v1.2.3 From f4c5c7f9d2e5ab005d57826b740b694b042a737c Mon Sep 17 00:00:00 2001 From: Felix Matouschek Date: Mon, 18 Apr 2022 15:28:03 +0200 Subject: mtd: spinand: Add support for XTX XT26G0xA Add support for XTX Technology XT26G01AXXXXX, XTX26G02AXXXXX and XTX26G04AXXXXX SPI NAND. These are 3V, 1G/2G/4Gbit serial SLC NAND flash devices with on-die ECC (8bit strength per 512bytes). Tested on Teltonika RUTX10 flashed with OpenWrt. Links: - http://www.xtxtech.com/download/?AId=225 - https://datasheet.lcsc.com/szlcsc/2005251034_XTX-XT26G01AWSEGA_C558841.pdf Signed-off-by: Felix Matouschek Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220418132803.664103-1-felix@matouschek.org --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 3aa28240a77f..5584d3bb6556 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -266,6 +266,7 @@ extern const struct spinand_manufacturer micron_spinand_manufacturer; extern const struct spinand_manufacturer paragon_spinand_manufacturer; extern const struct spinand_manufacturer toshiba_spinand_manufacturer; extern const struct spinand_manufacturer winbond_spinand_manufacturer; +extern const struct spinand_manufacturer xtx_spinand_manufacturer; /** * struct spinand_op_variants - SPI NAND operation variants -- cgit v1.2.3 From 0b0002315adfea3d9eb102665a4441727d4d2621 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 16 Apr 2022 18:45:56 +0800 Subject: crypto: hisilicon/qm - remove unused function declaration The 'hisi_qm_get_hw_version' function is unused, so remove the function declaration. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index e5522eaf88fd..5177858aeea9 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -453,7 +453,6 @@ int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number); void hisi_qm_debug_init(struct hisi_qm *qm); -enum qm_hw_ver hisi_qm_get_hw_version(struct pci_dev *pdev); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen); -- cgit v1.2.3 From fb06eb9727d67eac16e6b6aff35362476389cb88 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 16 Apr 2022 18:45:57 +0800 Subject: crypto: hisilicon/qm - set function with static These functions 'hisi_qm_create_qp' and 'hisi_qm_set_vft' are not used outside qm.c, so they are marked as static. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 5177858aeea9..e99b286b50bc 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -445,13 +445,11 @@ int hisi_qm_init(struct hisi_qm *qm); void hisi_qm_uninit(struct hisi_qm *qm); int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); -struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); void hisi_qm_release_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); -int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); -- cgit v1.2.3 From 7982996c5b0812cbf7846deff0e2f5dcbf290129 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 16 Apr 2022 18:45:58 +0800 Subject: crypto: hisilicon/qm - replace hisi_qm_release_qp() with hisi_qm_free_qps() hisi_qm_free_qps() can release multiple queues in one call, and it is already exported. So, replace hisi_qm_release_qp() with hisi_qm_free_qps() in zip_crypto.c, and do not export hisi_qm_release_qp() outside qm.c. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index e99b286b50bc..d2ccb11071c5 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -447,7 +447,6 @@ int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); -void hisi_qm_release_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); int hisi_qm_get_free_qp_num(struct hisi_qm *qm); void hisi_qm_debug_init(struct hisi_qm *qm); -- cgit v1.2.3 From b0c42232fce499ba96fbf2c5ebd2368efeb6597e Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 16 Apr 2022 18:45:59 +0800 Subject: crypto: hisilicon/qm - remove hisi_qm_get_free_qp_num() hisi_qm_get_free_qp_num() is to get the free queue number on the function. It is a simple function and is only called by hisi_qm_get_available_instances(). This patch modifies to get the free queue directly in hisi_qm_get_available_instances(), and remove hisi_qm_get_free_qp_num(). Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index d2ccb11071c5..6cabafffd0dd 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -448,7 +448,6 @@ int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); int hisi_qm_stop_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); -int hisi_qm_get_free_qp_num(struct hisi_qm *qm); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs); -- cgit v1.2.3 From 6c7f98b334a32df5cac8abac8983ac4ce17cab57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 16:13:59 +0200 Subject: vfio/mdev: Remove vfio_mdev.c Now that all mdev drivers directly create their own mdev_device driver and directly register with the vfio core's vfio_device_ops this is all dead code. Delete vfio_mdev.c and the mdev_parent_ops members that are connected to it. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Zhi Wang Link: http://patchwork.freedesktop.org/patch/msgid/20220411141403.86980-31-hch@lst.de Reviewed-by: Kirti Wankhede Reviewed-by: Zhi Wang --- include/linux/mdev.h | 48 +----------------------------------------------- 1 file changed, 1 insertion(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 15d03f6532d0..192aed656116 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -40,40 +40,7 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * @mdev_attr_groups: Attributes of the mediated device. * @supported_type_groups: Attributes to define supported types. It is mandatory * to provide supported types. - * @create: Called to allocate basic resources in parent device's - * driver for a particular mediated device. It is - * mandatory to provide create ops. - * @mdev: mdev_device structure on of mediated device - * that is being created - * Returns integer: success (0) or error (< 0) - * @remove: Called to free resources in parent device's driver for - * a mediated device. It is mandatory to provide 'remove' - * ops. - * @mdev: mdev_device device structure which is being - * destroyed - * Returns integer: success (0) or error (< 0) - * @read: Read emulation callback - * @mdev: mediated device structure - * @buf: read buffer - * @count: number of bytes to read - * @ppos: address. - * Retuns number on bytes read on success or error. - * @write: Write emulation callback - * @mdev: mediated device structure - * @buf: write buffer - * @count: number of bytes to be written - * @ppos: address. - * Retuns number on bytes written on success or error. - * @ioctl: IOCTL callback - * @mdev: mediated device structure - * @cmd: ioctl command - * @arg: arguments to ioctl - * @mmap: mmap callback - * @mdev: mediated device structure - * @vma: vma structure - * @request: request callback to release device - * @mdev: mediated device structure - * @count: request sequence number + * * Parent device that support mediated device should be registered with mdev * module with mdev_parent_ops structure. **/ @@ -83,19 +50,6 @@ struct mdev_parent_ops { const struct attribute_group **dev_attr_groups; const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; - - int (*create)(struct mdev_device *mdev); - int (*remove)(struct mdev_device *mdev); - int (*open_device)(struct mdev_device *mdev); - void (*close_device)(struct mdev_device *mdev); - ssize_t (*read)(struct mdev_device *mdev, char __user *buf, - size_t count, loff_t *ppos); - ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, - size_t count, loff_t *ppos); - long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, - unsigned long arg); - int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); - void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ -- cgit v1.2.3 From e6486939d8ea22569af942a1888aa3824f59cc5e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 16:14:00 +0200 Subject: vfio/mdev: Remove mdev_parent_ops dev_attr_groups This is only used by one sample to print a fixed string that is pointless. In general, having a device driver attach sysfs attributes to the parent is horrific. This should never happen, and always leads to some kind of liftime bug as it become very difficult for the sysfs attribute to go back to any data owned by the device driver. Remove the general mechanism to create this abuse. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Zhi Wang Link: http://patchwork.freedesktop.org/patch/msgid/20220411141403.86980-32-hch@lst.de Reviewed-by: Kirti Wankhede Reviewed-by: Zhi Wang --- include/linux/mdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 192aed656116..69df1fa2cb6f 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -36,7 +36,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * * @owner: The module owner. * @device_driver: Which device driver to probe() on newly created devices - * @dev_attr_groups: Attributes of the parent device. * @mdev_attr_groups: Attributes of the mediated device. * @supported_type_groups: Attributes to define supported types. It is mandatory * to provide supported types. @@ -47,7 +46,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); struct mdev_parent_ops { struct module *owner; struct mdev_driver *device_driver; - const struct attribute_group **dev_attr_groups; const struct attribute_group **mdev_attr_groups; struct attribute_group **supported_type_groups; }; -- cgit v1.2.3 From 6b42f491e17ce13f5ff7f2d1f49c73a0f4c47b20 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 16:14:01 +0200 Subject: vfio/mdev: Remove mdev_parent_ops The last useful member in this struct is the supported_type_groups, move it to the mdev_driver and delete mdev_parent_ops. Replace it with mdev_driver as an argument to mdev_register_device() Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Zhi Wang Link: http://patchwork.freedesktop.org/patch/msgid/20220411141403.86980-33-hch@lst.de Reviewed-by: Kirti Wankhede Reviewed-by: Zhi Wang --- include/linux/mdev.h | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 69df1fa2cb6f..1f6f57a3c316 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -30,26 +30,6 @@ unsigned int mdev_get_type_group_id(struct mdev_device *mdev); unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); -/** - * struct mdev_parent_ops - Structure to be registered for each parent device to - * register the device to mdev module. - * - * @owner: The module owner. - * @device_driver: Which device driver to probe() on newly created devices - * @mdev_attr_groups: Attributes of the mediated device. - * @supported_type_groups: Attributes to define supported types. It is mandatory - * to provide supported types. - * - * Parent device that support mediated device should be registered with mdev - * module with mdev_parent_ops structure. - **/ -struct mdev_parent_ops { - struct module *owner; - struct mdev_driver *device_driver; - const struct attribute_group **mdev_attr_groups; - struct attribute_group **supported_type_groups; -}; - /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; @@ -74,12 +54,15 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ * struct mdev_driver - Mediated device driver * @probe: called when new device created * @remove: called when device removed + * @supported_type_groups: Attributes to define supported types. It is mandatory + * to provide supported types. * @driver: device driver structure * **/ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); + struct attribute_group **supported_type_groups; struct device_driver driver; }; @@ -98,7 +81,7 @@ static inline const guid_t *mdev_uuid(struct mdev_device *mdev) extern struct bus_type mdev_bus_type; -int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); +int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver); void mdev_unregister_device(struct device *dev); int mdev_register_driver(struct mdev_driver *drv); -- cgit v1.2.3 From 2917f53113be3b7a0f374e02cebe6d6b749366b5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 16:14:03 +0200 Subject: vfio/mdev: Remove mdev drvdata This is no longer used, remove it. All usages were moved over to either use container_of() from a vfio_device or to use dev_drvdata() directly on the mdev. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Zhi Wang Link: http://patchwork.freedesktop.org/patch/msgid/20220411141403.86980-35-hch@lst.de Reviewed-by: Kirti Wankhede Reviewed-by: Zhi Wang --- include/linux/mdev.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 1f6f57a3c316..bb539794f54a 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -15,7 +15,6 @@ struct mdev_type; struct mdev_device { struct device dev; guid_t uuid; - void *driver_data; struct list_head next; struct mdev_type *type; bool active; @@ -66,14 +65,6 @@ struct mdev_driver { struct device_driver driver; }; -static inline void *mdev_get_drvdata(struct mdev_device *mdev) -{ - return mdev->driver_data; -} -static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) -{ - mdev->driver_data = data; -} static inline const guid_t *mdev_uuid(struct mdev_device *mdev) { return &mdev->uuid; -- cgit v1.2.3 From 042c48848b7d8dab4f095bce2872c02f711bf5d0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 5 Aug 2019 23:15:37 +0200 Subject: ARM: omap1: move lcd_dma code into omapfb driver The omapfb driver is split into platform specific code for omap1, and driver code that is also specific to omap1. Moving both parts into the driver directory simplifies the structure and avoids the dependency on certain omap machine header files. As mach/lcd_dma.h can not be included from include/linux/omap-dma.h any more now, move the omap_lcd_dma_running() declaration into the omap-dma header, which matches where it is defined. Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/omap-dma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 5c5c93ad6b50..441f5f0919c6 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -328,8 +328,8 @@ extern dma_addr_t omap_get_dma_dst_pos(int lch); extern int omap_get_dma_active_status(int lch); extern int omap_dma_running(void); -#if defined(CONFIG_ARCH_OMAP1) && IS_ENABLED(CONFIG_FB_OMAP) -#include +#if IS_ENABLED(CONFIG_FB_OMAP) +extern int omap_lcd_dma_running(void); #else static inline int omap_lcd_dma_running(void) { -- cgit v1.2.3 From 0768fb6709343679e55f7135e2ed2c432e4500d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Aug 2019 21:42:31 +0200 Subject: ARM: omap1: declare a dummy omap_set_dma_priority omapfb calls directly into the omap_set_dma_priority() function in the DMA driver. This prevents compile-testing omapfb on other architectures. Add an inline function next to the other ones for non-omap configurations. Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/omap-dma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 441f5f0919c6..5e228428fda1 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -338,6 +338,9 @@ static inline int omap_lcd_dma_running(void) #endif #else /* CONFIG_ARCH_OMAP */ +static inline void omap_set_dma_priority(int lch, int dst_port, int priority) +{ +} static inline struct omap_system_dma_plat_info *omap_get_plat_info(void) { -- cgit v1.2.3 From e8e77e97507bf3ec5419b8067d242b538cf75cba Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Aug 2019 14:34:31 +0200 Subject: ARM: omap1: move mach/usb.h to include/linux/soc The register definitions in this header are used in at least four different places, with little hope of completely cleaning that up. Split up the file into a portion that becomes a linux-wide header under include/linux/soc/ti/, and the parts that are actually only needed by board files. Acked-by: Felipe Balbi Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/soc/ti/omap1-usb.h | 116 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 include/linux/soc/ti/omap1-usb.h (limited to 'include/linux') diff --git a/include/linux/soc/ti/omap1-usb.h b/include/linux/soc/ti/omap1-usb.h new file mode 100644 index 000000000000..67488698601a --- /dev/null +++ b/include/linux/soc/ti/omap1-usb.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SOC_TI_OMAP1_USB +#define __SOC_TI_OMAP1_USB +/* + * Constants in this file are used all over the place, in platform + * code, as well as the udc, phy and ohci drivers. + * This is not a great design, but unlikely to get fixed after + * such a long time. Don't do this elsewhere. + */ + +#define OMAP1_OTG_BASE 0xfffb0400 +#define OMAP1_UDC_BASE 0xfffb4000 + +#define OMAP2_UDC_BASE 0x4805e200 +#define OMAP2_OTG_BASE 0x4805e300 +#define OTG_BASE OMAP1_OTG_BASE +#define UDC_BASE OMAP1_UDC_BASE + +/* + * OTG and transceiver registers, for OMAPs starting with ARM926 + */ +#define OTG_REV (OTG_BASE + 0x00) +#define OTG_SYSCON_1 (OTG_BASE + 0x04) +# define USB2_TRX_MODE(w) (((w)>>24)&0x07) +# define USB1_TRX_MODE(w) (((w)>>20)&0x07) +# define USB0_TRX_MODE(w) (((w)>>16)&0x07) +# define OTG_IDLE_EN (1 << 15) +# define HST_IDLE_EN (1 << 14) +# define DEV_IDLE_EN (1 << 13) +# define OTG_RESET_DONE (1 << 2) +# define OTG_SOFT_RESET (1 << 1) +#define OTG_SYSCON_2 (OTG_BASE + 0x08) +# define OTG_EN (1 << 31) +# define USBX_SYNCHRO (1 << 30) +# define OTG_MST16 (1 << 29) +# define SRP_GPDATA (1 << 28) +# define SRP_GPDVBUS (1 << 27) +# define SRP_GPUVBUS(w) (((w)>>24)&0x07) +# define A_WAIT_VRISE(w) (((w)>>20)&0x07) +# define B_ASE_BRST(w) (((w)>>16)&0x07) +# define SRP_DPW (1 << 14) +# define SRP_DATA (1 << 13) +# define SRP_VBUS (1 << 12) +# define OTG_PADEN (1 << 10) +# define HMC_PADEN (1 << 9) +# define UHOST_EN (1 << 8) +# define HMC_TLLSPEED (1 << 7) +# define HMC_TLLATTACH (1 << 6) +# define OTG_HMC(w) (((w)>>0)&0x3f) +#define OTG_CTRL (OTG_BASE + 0x0c) +# define OTG_USB2_EN (1 << 29) +# define OTG_USB2_DP (1 << 28) +# define OTG_USB2_DM (1 << 27) +# define OTG_USB1_EN (1 << 26) +# define OTG_USB1_DP (1 << 25) +# define OTG_USB1_DM (1 << 24) +# define OTG_USB0_EN (1 << 23) +# define OTG_USB0_DP (1 << 22) +# define OTG_USB0_DM (1 << 21) +# define OTG_ASESSVLD (1 << 20) +# define OTG_BSESSEND (1 << 19) +# define OTG_BSESSVLD (1 << 18) +# define OTG_VBUSVLD (1 << 17) +# define OTG_ID (1 << 16) +# define OTG_DRIVER_SEL (1 << 15) +# define OTG_A_SETB_HNPEN (1 << 12) +# define OTG_A_BUSREQ (1 << 11) +# define OTG_B_HNPEN (1 << 9) +# define OTG_B_BUSREQ (1 << 8) +# define OTG_BUSDROP (1 << 7) +# define OTG_PULLDOWN (1 << 5) +# define OTG_PULLUP (1 << 4) +# define OTG_DRV_VBUS (1 << 3) +# define OTG_PD_VBUS (1 << 2) +# define OTG_PU_VBUS (1 << 1) +# define OTG_PU_ID (1 << 0) +#define OTG_IRQ_EN (OTG_BASE + 0x10) /* 16-bit */ +# define DRIVER_SWITCH (1 << 15) +# define A_VBUS_ERR (1 << 13) +# define A_REQ_TMROUT (1 << 12) +# define A_SRP_DETECT (1 << 11) +# define B_HNP_FAIL (1 << 10) +# define B_SRP_TMROUT (1 << 9) +# define B_SRP_DONE (1 << 8) +# define B_SRP_STARTED (1 << 7) +# define OPRT_CHG (1 << 0) +#define OTG_IRQ_SRC (OTG_BASE + 0x14) /* 16-bit */ + // same bits as in IRQ_EN +#define OTG_OUTCTRL (OTG_BASE + 0x18) /* 16-bit */ +# define OTGVPD (1 << 14) +# define OTGVPU (1 << 13) +# define OTGPUID (1 << 12) +# define USB2VDR (1 << 10) +# define USB2PDEN (1 << 9) +# define USB2PUEN (1 << 8) +# define USB1VDR (1 << 6) +# define USB1PDEN (1 << 5) +# define USB1PUEN (1 << 4) +# define USB0VDR (1 << 2) +# define USB0PDEN (1 << 1) +# define USB0PUEN (1 << 0) +#define OTG_TEST (OTG_BASE + 0x20) /* 16-bit */ +#define OTG_VENDOR_CODE (OTG_BASE + 0xfc) /* 16-bit */ + +/*-------------------------------------------------------------------------*/ + +/* OMAP1 */ +#define USB_TRANSCEIVER_CTRL (0xfffe1000 + 0x0064) +# define CONF_USB2_UNI_R (1 << 8) +# define CONF_USB1_UNI_R (1 << 7) +# define CONF_USB_PORT0_R(x) (((x)>>4)&0x7) +# define CONF_USB0_ISOLATE_R (1 << 3) +# define CONF_USB_PWRDN_DM_R (1 << 2) +# define CONF_USB_PWRDN_DP_R (1 << 1) + +#endif -- cgit v1.2.3 From 1e9ca7c811f76b16c7822ad009b6b0c352227a15 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Aug 2019 15:24:58 +0200 Subject: ARM: omap1: move some headers to include/linux/soc There are three remaining header files that are used by omap1 specific device drivers: - mach/soc.h provides cpu_is_omapXXX abstractions - mach/hardware.h provides omap_read/omap_write functions and physical addresses - mach/mux.h provides an omap specific pinctrl abstraction This is generally not how we do platform abstractions today, and it would be good to completely get rid of these in favor of passing information through platform devices and the pinctrl subsystem. However, given that nobody is working on that, just move it one step forward by splitting out the header files that are used by drivers today from the machine headers that are only used internally. Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/soc/ti/omap1-io.h | 143 ++++++++++++++++++ include/linux/soc/ti/omap1-mux.h | 311 +++++++++++++++++++++++++++++++++++++++ include/linux/soc/ti/omap1-soc.h | 198 +++++++++++++++++++++++++ 3 files changed, 652 insertions(+) create mode 100644 include/linux/soc/ti/omap1-io.h create mode 100644 include/linux/soc/ti/omap1-mux.h create mode 100644 include/linux/soc/ti/omap1-soc.h (limited to 'include/linux') diff --git a/include/linux/soc/ti/omap1-io.h b/include/linux/soc/ti/omap1-io.h new file mode 100644 index 000000000000..9332c92690f4 --- /dev/null +++ b/include/linux/soc/ti/omap1-io.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_ARCH_OMAP_IO_H +#define __ASM_ARCH_OMAP_IO_H + +#ifndef __ASSEMBLER__ +#include + +#if defined(CONFIG_ARCH_OMAP) && defined(CONFIG_ARCH_OMAP1) +/* + * NOTE: Please use ioremap + __raw_read/write where possible instead of these + */ +extern u8 omap_readb(u32 pa); +extern u16 omap_readw(u32 pa); +extern u32 omap_readl(u32 pa); +extern void omap_writeb(u8 v, u32 pa); +extern void omap_writew(u16 v, u32 pa); +extern void omap_writel(u32 v, u32 pa); +#elif defined(CONFIG_COMPILE_TEST) +static inline u8 omap_readb(u32 pa) { return 0; } +static inline u16 omap_readw(u32 pa) { return 0; } +static inline u32 omap_readl(u32 pa) { return 0; } +static inline void omap_writeb(u8 v, u32 pa) { } +static inline void omap_writew(u16 v, u32 pa) { } +static inline void omap_writel(u32 v, u32 pa) { } +#endif +#endif + +/* + * ---------------------------------------------------------------------------- + * System control registers + * ---------------------------------------------------------------------------- + */ +#define MOD_CONF_CTRL_0 0xfffe1080 +#define MOD_CONF_CTRL_1 0xfffe1110 + +/* + * --------------------------------------------------------------------------- + * UPLD + * --------------------------------------------------------------------------- + */ +#define ULPD_REG_BASE (0xfffe0800) +#define ULPD_IT_STATUS (ULPD_REG_BASE + 0x14) +#define ULPD_SETUP_ANALOG_CELL_3 (ULPD_REG_BASE + 0x24) +#define ULPD_CLOCK_CTRL (ULPD_REG_BASE + 0x30) +# define DIS_USB_PVCI_CLK (1 << 5) /* no USB/FAC synch */ +# define USB_MCLK_EN (1 << 4) /* enable W4_USB_CLKO */ +#define ULPD_SOFT_REQ (ULPD_REG_BASE + 0x34) +# define SOFT_UDC_REQ (1 << 4) +# define SOFT_USB_CLK_REQ (1 << 3) +# define SOFT_DPLL_REQ (1 << 0) +#define ULPD_DPLL_CTRL (ULPD_REG_BASE + 0x3c) +#define ULPD_STATUS_REQ (ULPD_REG_BASE + 0x40) +#define ULPD_APLL_CTRL (ULPD_REG_BASE + 0x4c) +#define ULPD_POWER_CTRL (ULPD_REG_BASE + 0x50) +#define ULPD_SOFT_DISABLE_REQ_REG (ULPD_REG_BASE + 0x68) +# define DIS_MMC2_DPLL_REQ (1 << 11) +# define DIS_MMC1_DPLL_REQ (1 << 10) +# define DIS_UART3_DPLL_REQ (1 << 9) +# define DIS_UART2_DPLL_REQ (1 << 8) +# define DIS_UART1_DPLL_REQ (1 << 7) +# define DIS_USB_HOST_DPLL_REQ (1 << 6) +#define ULPD_SDW_CLK_DIV_CTRL_SEL (ULPD_REG_BASE + 0x74) +#define ULPD_CAM_CLK_CTRL (ULPD_REG_BASE + 0x7c) + +/* + * ---------------------------------------------------------------------------- + * Clocks + * ---------------------------------------------------------------------------- + */ +#define CLKGEN_REG_BASE (0xfffece00) +#define ARM_CKCTL (CLKGEN_REG_BASE + 0x0) +#define ARM_IDLECT1 (CLKGEN_REG_BASE + 0x4) +#define ARM_IDLECT2 (CLKGEN_REG_BASE + 0x8) +#define ARM_EWUPCT (CLKGEN_REG_BASE + 0xC) +#define ARM_RSTCT1 (CLKGEN_REG_BASE + 0x10) +#define ARM_RSTCT2 (CLKGEN_REG_BASE + 0x14) +#define ARM_SYSST (CLKGEN_REG_BASE + 0x18) +#define ARM_IDLECT3 (CLKGEN_REG_BASE + 0x24) + +#define CK_RATEF 1 +#define CK_IDLEF 2 +#define CK_ENABLEF 4 +#define CK_SELECTF 8 +#define SETARM_IDLE_SHIFT + +/* DPLL control registers */ +#define DPLL_CTL (0xfffecf00) + +/* DSP clock control. Must use __raw_readw() and __raw_writew() with these */ +#define DSP_CONFIG_REG_BASE IOMEM(0xe1008000) +#define DSP_CKCTL (DSP_CONFIG_REG_BASE + 0x0) +#define DSP_IDLECT1 (DSP_CONFIG_REG_BASE + 0x4) +#define DSP_IDLECT2 (DSP_CONFIG_REG_BASE + 0x8) +#define DSP_RSTCT2 (DSP_CONFIG_REG_BASE + 0x14) + +/* + * ---------------------------------------------------------------------------- + * Pulse-Width Light + * ---------------------------------------------------------------------------- + */ +#define OMAP_PWL_BASE 0xfffb5800 +#define OMAP_PWL_ENABLE (OMAP_PWL_BASE + 0x00) +#define OMAP_PWL_CLK_ENABLE (OMAP_PWL_BASE + 0x04) + +/* + * ---------------------------------------------------------------------------- + * Pin multiplexing registers + * ---------------------------------------------------------------------------- + */ +#define FUNC_MUX_CTRL_0 0xfffe1000 +#define FUNC_MUX_CTRL_1 0xfffe1004 +#define FUNC_MUX_CTRL_2 0xfffe1008 +#define COMP_MODE_CTRL_0 0xfffe100c +#define FUNC_MUX_CTRL_3 0xfffe1010 +#define FUNC_MUX_CTRL_4 0xfffe1014 +#define FUNC_MUX_CTRL_5 0xfffe1018 +#define FUNC_MUX_CTRL_6 0xfffe101C +#define FUNC_MUX_CTRL_7 0xfffe1020 +#define FUNC_MUX_CTRL_8 0xfffe1024 +#define FUNC_MUX_CTRL_9 0xfffe1028 +#define FUNC_MUX_CTRL_A 0xfffe102C +#define FUNC_MUX_CTRL_B 0xfffe1030 +#define FUNC_MUX_CTRL_C 0xfffe1034 +#define FUNC_MUX_CTRL_D 0xfffe1038 +#define PULL_DWN_CTRL_0 0xfffe1040 +#define PULL_DWN_CTRL_1 0xfffe1044 +#define PULL_DWN_CTRL_2 0xfffe1048 +#define PULL_DWN_CTRL_3 0xfffe104c +#define PULL_DWN_CTRL_4 0xfffe10ac + +/* OMAP-1610 specific multiplexing registers */ +#define FUNC_MUX_CTRL_E 0xfffe1090 +#define FUNC_MUX_CTRL_F 0xfffe1094 +#define FUNC_MUX_CTRL_10 0xfffe1098 +#define FUNC_MUX_CTRL_11 0xfffe109c +#define FUNC_MUX_CTRL_12 0xfffe10a0 +#define PU_PD_SEL_0 0xfffe10b4 +#define PU_PD_SEL_1 0xfffe10b8 +#define PU_PD_SEL_2 0xfffe10bc +#define PU_PD_SEL_3 0xfffe10c0 +#define PU_PD_SEL_4 0xfffe10c4 + +#endif diff --git a/include/linux/soc/ti/omap1-mux.h b/include/linux/soc/ti/omap1-mux.h new file mode 100644 index 000000000000..59c239b5569c --- /dev/null +++ b/include/linux/soc/ti/omap1-mux.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __SOC_TI_OMAP1_MUX_H +#define __SOC_TI_OMAP1_MUX_H +/* + * This should not really be a global header, it reflects the + * traditional way that omap1 does pin muxing without the + * pinctrl subsystem. + */ + +enum omap7xx_index { + /* OMAP 730 keyboard */ + E2_7XX_KBR0, + J7_7XX_KBR1, + E1_7XX_KBR2, + F3_7XX_KBR3, + D2_7XX_KBR4, + C2_7XX_KBC0, + D3_7XX_KBC1, + E4_7XX_KBC2, + F4_7XX_KBC3, + E3_7XX_KBC4, + + /* USB */ + AA17_7XX_USB_DM, + W16_7XX_USB_PU_EN, + W17_7XX_USB_VBUSI, + W18_7XX_USB_DMCK_OUT, + W19_7XX_USB_DCRST, + + /* MMC */ + MMC_7XX_CMD, + MMC_7XX_CLK, + MMC_7XX_DAT0, + + /* I2C */ + I2C_7XX_SCL, + I2C_7XX_SDA, + + /* SPI */ + SPI_7XX_1, + SPI_7XX_2, + SPI_7XX_3, + SPI_7XX_4, + SPI_7XX_5, + SPI_7XX_6, + + /* UART */ + UART_7XX_1, + UART_7XX_2, +}; + +enum omap1xxx_index { + /* UART1 (BT_UART_GATING)*/ + UART1_TX = 0, + UART1_RTS, + + /* UART2 (COM_UART_GATING)*/ + UART2_TX, + UART2_RX, + UART2_CTS, + UART2_RTS, + + /* UART3 (GIGA_UART_GATING) */ + UART3_TX, + UART3_RX, + UART3_CTS, + UART3_RTS, + UART3_CLKREQ, + UART3_BCLK, /* 12MHz clock out */ + Y15_1610_UART3_RTS, + + /* PWT & PWL */ + PWT, + PWL, + + /* USB master generic */ + R18_USB_VBUS, + R18_1510_USB_GPIO0, + W4_USB_PUEN, + W4_USB_CLKO, + W4_USB_HIGHZ, + W4_GPIO58, + + /* USB1 master */ + USB1_SUSP, + USB1_SEO, + W13_1610_USB1_SE0, + USB1_TXEN, + USB1_TXD, + USB1_VP, + USB1_VM, + USB1_RCV, + USB1_SPEED, + R13_1610_USB1_SPEED, + R13_1710_USB1_SE0, + + /* USB2 master */ + USB2_SUSP, + USB2_VP, + USB2_TXEN, + USB2_VM, + USB2_RCV, + USB2_SEO, + USB2_TXD, + + /* OMAP-1510 GPIO */ + R18_1510_GPIO0, + R19_1510_GPIO1, + M14_1510_GPIO2, + + /* OMAP1610 GPIO */ + P18_1610_GPIO3, + Y15_1610_GPIO17, + + /* OMAP-1710 GPIO */ + R18_1710_GPIO0, + V2_1710_GPIO10, + N21_1710_GPIO14, + W15_1710_GPIO40, + + /* MPUIO */ + MPUIO2, + N15_1610_MPUIO2, + MPUIO4, + MPUIO5, + T20_1610_MPUIO5, + W11_1610_MPUIO6, + V10_1610_MPUIO7, + W11_1610_MPUIO9, + V10_1610_MPUIO10, + W10_1610_MPUIO11, + E20_1610_MPUIO13, + U20_1610_MPUIO14, + E19_1610_MPUIO15, + + /* MCBSP2 */ + MCBSP2_CLKR, + MCBSP2_CLKX, + MCBSP2_DR, + MCBSP2_DX, + MCBSP2_FSR, + MCBSP2_FSX, + + /* MCBSP3 */ + MCBSP3_CLKX, + + /* Misc ballouts */ + BALLOUT_V8_ARMIO3, + N20_HDQ, + + /* OMAP-1610 MMC2 */ + W8_1610_MMC2_DAT0, + V8_1610_MMC2_DAT1, + W15_1610_MMC2_DAT2, + R10_1610_MMC2_DAT3, + Y10_1610_MMC2_CLK, + Y8_1610_MMC2_CMD, + V9_1610_MMC2_CMDDIR, + V5_1610_MMC2_DATDIR0, + W19_1610_MMC2_DATDIR1, + R18_1610_MMC2_CLKIN, + + /* OMAP-1610 External Trace Interface */ + M19_1610_ETM_PSTAT0, + L15_1610_ETM_PSTAT1, + L18_1610_ETM_PSTAT2, + L19_1610_ETM_D0, + J19_1610_ETM_D6, + J18_1610_ETM_D7, + + /* OMAP16XX GPIO */ + P20_1610_GPIO4, + V9_1610_GPIO7, + W8_1610_GPIO9, + N20_1610_GPIO11, + N19_1610_GPIO13, + P10_1610_GPIO22, + V5_1610_GPIO24, + AA20_1610_GPIO_41, + W19_1610_GPIO48, + M7_1610_GPIO62, + V14_16XX_GPIO37, + R9_16XX_GPIO18, + L14_16XX_GPIO49, + + /* OMAP-1610 uWire */ + V19_1610_UWIRE_SCLK, + U18_1610_UWIRE_SDI, + W21_1610_UWIRE_SDO, + N14_1610_UWIRE_CS0, + P15_1610_UWIRE_CS3, + N15_1610_UWIRE_CS1, + + /* OMAP-1610 SPI */ + U19_1610_SPIF_SCK, + U18_1610_SPIF_DIN, + P20_1610_SPIF_DIN, + W21_1610_SPIF_DOUT, + R18_1610_SPIF_DOUT, + N14_1610_SPIF_CS0, + N15_1610_SPIF_CS1, + T19_1610_SPIF_CS2, + P15_1610_SPIF_CS3, + + /* OMAP-1610 Flash */ + L3_1610_FLASH_CS2B_OE, + M8_1610_FLASH_CS2B_WE, + + /* First MMC */ + MMC_CMD, + MMC_DAT1, + MMC_DAT2, + MMC_DAT0, + MMC_CLK, + MMC_DAT3, + + /* OMAP-1710 MMC CMDDIR and DATDIR0 */ + M15_1710_MMC_CLKI, + P19_1710_MMC_CMDDIR, + P20_1710_MMC_DATDIR0, + + /* OMAP-1610 USB0 alternate pin configuration */ + W9_USB0_TXEN, + AA9_USB0_VP, + Y5_USB0_RCV, + R9_USB0_VM, + V6_USB0_TXD, + W5_USB0_SE0, + V9_USB0_SPEED, + V9_USB0_SUSP, + + /* USB2 */ + W9_USB2_TXEN, + AA9_USB2_VP, + Y5_USB2_RCV, + R9_USB2_VM, + V6_USB2_TXD, + W5_USB2_SE0, + + /* 16XX UART */ + R13_1610_UART1_TX, + V14_16XX_UART1_RX, + R14_1610_UART1_CTS, + AA15_1610_UART1_RTS, + R9_16XX_UART2_RX, + L14_16XX_UART3_RX, + + /* I2C OMAP-1610 */ + I2C_SCL, + I2C_SDA, + + /* Keypad */ + F18_1610_KBC0, + D20_1610_KBC1, + D19_1610_KBC2, + E18_1610_KBC3, + C21_1610_KBC4, + G18_1610_KBR0, + F19_1610_KBR1, + H14_1610_KBR2, + E20_1610_KBR3, + E19_1610_KBR4, + N19_1610_KBR5, + + /* Power management */ + T20_1610_LOW_PWR, + + /* MCLK Settings */ + V5_1710_MCLK_ON, + V5_1710_MCLK_OFF, + R10_1610_MCLK_ON, + R10_1610_MCLK_OFF, + + /* CompactFlash controller */ + P11_1610_CF_CD2, + R11_1610_CF_IOIS16, + V10_1610_CF_IREQ, + W10_1610_CF_RESET, + W11_1610_CF_CD1, + + /* parallel camera */ + J15_1610_CAM_LCLK, + J18_1610_CAM_D7, + J19_1610_CAM_D6, + J14_1610_CAM_D5, + K18_1610_CAM_D4, + K19_1610_CAM_D3, + K15_1610_CAM_D2, + K14_1610_CAM_D1, + L19_1610_CAM_D0, + L18_1610_CAM_VS, + L15_1610_CAM_HS, + M19_1610_CAM_RSTZ, + Y15_1610_CAM_OUTCLK, + + /* serial camera */ + H19_1610_CAM_EXCLK, + Y12_1610_CCP_CLKP, + W13_1610_CCP_CLKM, + W14_1610_CCP_DATAP, + Y14_1610_CCP_DATAM, + +}; + +#ifdef CONFIG_OMAP_MUX +extern int omap_cfg_reg(unsigned long reg_cfg); +#else +static inline int omap_cfg_reg(unsigned long reg_cfg) { return 0; } +#endif + +#endif diff --git a/include/linux/soc/ti/omap1-soc.h b/include/linux/soc/ti/omap1-soc.h new file mode 100644 index 000000000000..81008d400bb6 --- /dev/null +++ b/include/linux/soc/ti/omap1-soc.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * OMAP cpu type detection + * + * Copyright (C) 2004, 2008 Nokia Corporation + * + * Copyright (C) 2009-11 Texas Instruments. + * + * Written by Tony Lindgren + * + * Added OMAP4/5 specific defines - Santosh Shilimkar + */ + +#ifndef __ASM_ARCH_OMAP_CPU_H +#define __ASM_ARCH_OMAP_CPU_H + +/* + * Test if multicore OMAP support is needed + */ +#undef MULTI_OMAP1 +#undef OMAP_NAME + +#ifdef CONFIG_ARCH_OMAP730 +# ifdef OMAP_NAME +# undef MULTI_OMAP1 +# define MULTI_OMAP1 +# else +# define OMAP_NAME omap730 +# endif +#endif +#ifdef CONFIG_ARCH_OMAP850 +# ifdef OMAP_NAME +# undef MULTI_OMAP1 +# define MULTI_OMAP1 +# else +# define OMAP_NAME omap850 +# endif +#endif +#ifdef CONFIG_ARCH_OMAP15XX +# ifdef OMAP_NAME +# undef MULTI_OMAP1 +# define MULTI_OMAP1 +# else +# define OMAP_NAME omap1510 +# endif +#endif +#ifdef CONFIG_ARCH_OMAP16XX +# ifdef OMAP_NAME +# undef MULTI_OMAP1 +# define MULTI_OMAP1 +# else +# define OMAP_NAME omap16xx +# endif +#endif + +/* + * omap_rev bits: + * CPU id bits (0730, 1510, 1710, 2422...) [31:16] + * CPU revision (See _REV_ defined in cpu.h) [15:08] + * CPU class bits (15xx, 16xx, 24xx, 34xx...) [07:00] + */ +unsigned int omap_rev(void); + +/* + * Get the CPU revision for OMAP devices + */ +#define GET_OMAP_REVISION() ((omap_rev() >> 8) & 0xff) + +/* + * Macros to group OMAP into cpu classes. + * These can be used in most places. + * cpu_is_omap7xx(): True for OMAP730, OMAP850 + * cpu_is_omap15xx(): True for OMAP1510, OMAP5910 and OMAP310 + * cpu_is_omap16xx(): True for OMAP1610, OMAP5912 and OMAP1710 + */ +#define GET_OMAP_CLASS (omap_rev() & 0xff) + +#define IS_OMAP_CLASS(class, id) \ +static inline int is_omap ##class (void) \ +{ \ + return (GET_OMAP_CLASS == (id)) ? 1 : 0; \ +} + +#define GET_OMAP_SUBCLASS ((omap_rev() >> 20) & 0x0fff) + +#define IS_OMAP_SUBCLASS(subclass, id) \ +static inline int is_omap ##subclass (void) \ +{ \ + return (GET_OMAP_SUBCLASS == (id)) ? 1 : 0; \ +} + +IS_OMAP_CLASS(7xx, 0x07) +IS_OMAP_CLASS(15xx, 0x15) +IS_OMAP_CLASS(16xx, 0x16) + +#define cpu_is_omap7xx() 0 +#define cpu_is_omap15xx() 0 +#define cpu_is_omap16xx() 0 + +#if defined(MULTI_OMAP1) +# if defined(CONFIG_ARCH_OMAP730) +# undef cpu_is_omap7xx +# define cpu_is_omap7xx() is_omap7xx() +# endif +# if defined(CONFIG_ARCH_OMAP850) +# undef cpu_is_omap7xx +# define cpu_is_omap7xx() is_omap7xx() +# endif +# if defined(CONFIG_ARCH_OMAP15XX) +# undef cpu_is_omap15xx +# define cpu_is_omap15xx() is_omap15xx() +# endif +# if defined(CONFIG_ARCH_OMAP16XX) +# undef cpu_is_omap16xx +# define cpu_is_omap16xx() is_omap16xx() +# endif +#else +# if defined(CONFIG_ARCH_OMAP730) +# undef cpu_is_omap7xx +# define cpu_is_omap7xx() 1 +# endif +# if defined(CONFIG_ARCH_OMAP850) +# undef cpu_is_omap7xx +# define cpu_is_omap7xx() 1 +# endif +# if defined(CONFIG_ARCH_OMAP15XX) +# undef cpu_is_omap15xx +# define cpu_is_omap15xx() 1 +# endif +# if defined(CONFIG_ARCH_OMAP16XX) +# undef cpu_is_omap16xx +# define cpu_is_omap16xx() 1 +# endif +#endif + +/* + * Macros to detect individual cpu types. + * These are only rarely needed. + * cpu_is_omap310(): True for OMAP310 + * cpu_is_omap1510(): True for OMAP1510 + * cpu_is_omap1610(): True for OMAP1610 + * cpu_is_omap1611(): True for OMAP1611 + * cpu_is_omap5912(): True for OMAP5912 + * cpu_is_omap1621(): True for OMAP1621 + * cpu_is_omap1710(): True for OMAP1710 + */ +#define GET_OMAP_TYPE ((omap_rev() >> 16) & 0xffff) + +#define IS_OMAP_TYPE(type, id) \ +static inline int is_omap ##type (void) \ +{ \ + return (GET_OMAP_TYPE == (id)) ? 1 : 0; \ +} + +IS_OMAP_TYPE(310, 0x0310) +IS_OMAP_TYPE(1510, 0x1510) +IS_OMAP_TYPE(1610, 0x1610) +IS_OMAP_TYPE(1611, 0x1611) +IS_OMAP_TYPE(5912, 0x1611) +IS_OMAP_TYPE(1621, 0x1621) +IS_OMAP_TYPE(1710, 0x1710) + +#define cpu_is_omap310() 0 +#define cpu_is_omap1510() 0 +#define cpu_is_omap1610() 0 +#define cpu_is_omap5912() 0 +#define cpu_is_omap1611() 0 +#define cpu_is_omap1621() 0 +#define cpu_is_omap1710() 0 + +#define cpu_class_is_omap1() 1 + +/* + * Whether we have MULTI_OMAP1 or not, we still need to distinguish + * between 310 vs. 1510 and 1611B/5912 vs. 1710. + */ + +#if defined(CONFIG_ARCH_OMAP15XX) +# undef cpu_is_omap310 +# undef cpu_is_omap1510 +# define cpu_is_omap310() is_omap310() +# define cpu_is_omap1510() is_omap1510() +#endif + +#if defined(CONFIG_ARCH_OMAP16XX) +# undef cpu_is_omap1610 +# undef cpu_is_omap1611 +# undef cpu_is_omap5912 +# undef cpu_is_omap1621 +# undef cpu_is_omap1710 +# define cpu_is_omap1610() is_omap1610() +# define cpu_is_omap1611() is_omap1611() +# define cpu_is_omap5912() is_omap5912() +# define cpu_is_omap1621() is_omap1621() +# define cpu_is_omap1710() is_omap1710() +#endif + +#endif -- cgit v1.2.3 From 9fe15316563cbd46601c770a7214ccc5e1925bfb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Aug 2019 15:13:18 +0200 Subject: ARM: omap1: innovator: move ohci phy power handling to board file The innovator board needs a special case for its phy control. Move the corresponding code into the board file and out of the common code by adding another callback. Acked-by: Felipe Balbi Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/platform_data/usb-omap1.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-omap1.h b/include/linux/platform_data/usb-omap1.h index 878e572a78bf..e7b8dc92a269 100644 --- a/include/linux/platform_data/usb-omap1.h +++ b/include/linux/platform_data/usb-omap1.h @@ -50,6 +50,8 @@ struct omap_usb_config { int (*ocpi_enable)(void); void (*lb_reset)(void); + + int (*transceiver_power)(int on); }; #endif /* __LINUX_USB_OMAP1_H */ -- cgit v1.2.3 From 17ea03b75e5665c9ce4945aa5afd097f3c845cdf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Nov 2020 15:46:47 +0100 Subject: ARM: omap: dma: make usb support optional Most of the plat-omap/dma.c code is specific to the USB driver. Hide that code when it is not in use, to make it clearer which parts are actually still required. Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/omap-dma.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 5e228428fda1..07fa58ae9902 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -299,8 +299,9 @@ extern void omap_set_dma_priority(int lch, int dst_port, int priority); extern int omap_request_dma(int dev_id, const char *dev_name, void (*callback)(int lch, u16 ch_status, void *data), void *data, int *dma_ch); -extern void omap_disable_dma_irq(int ch, u16 irq_bits); extern void omap_free_dma(int ch); +#if IS_ENABLED(CONFIG_USB_OMAP) +extern void omap_disable_dma_irq(int ch, u16 irq_bits); extern void omap_start_dma(int lch); extern void omap_stop_dma(int lch); extern void omap_set_dma_transfer_params(int lch, int data_type, @@ -326,6 +327,8 @@ extern void omap_set_dma_dest_burst_mode(int lch, extern dma_addr_t omap_get_dma_src_pos(int lch); extern dma_addr_t omap_get_dma_dst_pos(int lch); extern int omap_get_dma_active_status(int lch); +#endif + extern int omap_dma_running(void); #if IS_ENABLED(CONFIG_FB_OMAP) -- cgit v1.2.3 From 3550bba25d5587a701e6edf20e20984d2ee72c78 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sat, 9 Apr 2022 11:51:28 +0200 Subject: gpiolib: of: Introduce hook for missing gpio-ranges Since commit 2ab73c6d8323 ("gpio: Support GPIO controllers without pin-ranges") the device tree nodes of GPIO controller need the gpio-ranges property to handle gpio-hogs. Unfortunately it's impossible to guarantee that every new kernel is shipped with an updated device tree binary. In order to provide backward compatibility with those older DTB, we need a callback within of_gpiochip_add_pin_range() so the relevant platform driver can handle this case. Fixes: 2ab73c6d8323 ("gpio: Support GPIO controllers without pin-ranges") Signed-off-by: Stefan Wahren Reviewed-by: Florian Fainelli Tested-by: Florian Fainelli Acked-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20220409095129.45786-2-stefan.wahren@i2se.com Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 98c93510640e..c2041bd01f90 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -492,6 +492,18 @@ struct gpio_chip { */ int (*of_xlate)(struct gpio_chip *gc, const struct of_phandle_args *gpiospec, u32 *flags); + + /** + * @of_gpio_ranges_fallback: + * + * Optional hook for the case that no gpio-ranges property is defined + * within the device tree node "np" (usually DT before introduction + * of gpio-ranges). So this callback is helpful to provide the + * necessary backward compatibility for the pin ranges. + */ + int (*of_gpio_ranges_fallback)(struct gpio_chip *gc, + struct device_node *np); + #endif /* CONFIG_OF_GPIO */ }; -- cgit v1.2.3 From 988f11e046401f8561c4afefa506a50f0203de40 Mon Sep 17 00:00:00 2001 From: liaohua Date: Thu, 7 Apr 2022 15:29:48 +0800 Subject: latencytop: move sysctl to its own file This moves latencytop sysctl to kernel/latencytop.c Signed-off-by: liaohua Signed-off-by: Luis Chamberlain --- include/linux/latencytop.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index abe3d95f795b..84f1053cf2a8 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,9 +38,6 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos); - #else static inline void -- cgit v1.2.3 From 52ef8efcb75e8a8aab88e74c1376c2785d9a5452 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 2 Apr 2022 00:28:23 +0200 Subject: dma: omap: hide legacy interface The legacy interface for omap-dma is only used on OMAP1, and the same is true for the non-DT case. Make both of these conditional on CONFIG_ARCH_OMAP1 being set to simplify the dependency. The non-OMAP stub functions in include/linux/omap-dma.h are note needed any more either now, because they are only called on OMAP1. Acked-by: Tony Lindgren Acked-By: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/omap-dma.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 07fa58ae9902..254b4e10511b 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -292,7 +292,6 @@ struct omap_system_dma_plat_info { #define dma_omap15xx() __dma_omap15xx(d) #define dma_omap16xx() __dma_omap16xx(d) -#if defined(CONFIG_ARCH_OMAP) extern struct omap_system_dma_plat_info *omap_get_plat_info(void); extern void omap_set_dma_priority(int lch, int dst_port, int priority); @@ -340,25 +339,4 @@ static inline int omap_lcd_dma_running(void) } #endif -#else /* CONFIG_ARCH_OMAP */ -static inline void omap_set_dma_priority(int lch, int dst_port, int priority) -{ -} - -static inline struct omap_system_dma_plat_info *omap_get_plat_info(void) -{ - return NULL; -} - -static inline int omap_request_dma(int dev_id, const char *dev_name, - void (*callback)(int lch, u16 ch_status, void *data), - void *data, int *dma_ch) -{ - return -ENODEV; -} - -static inline void omap_free_dma(int ch) { } - -#endif /* CONFIG_ARCH_OMAP */ - #endif /* __LINUX_OMAP_DMA_H */ -- cgit v1.2.3 From 615dce5bf7369334f8c4f19f7f8f2a5634b4f911 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 12 Sep 2019 10:11:24 +0200 Subject: ARM: omap1: fix build with no SoC selected In a multiplatform randconfig kernel, one can have CONFIG_ARCH_OMAP1 enabled, but none of the specific SoCs. This leads to some build issues as the code is not meant to deal with this configuration at the moment: arch/arm/mach-omap1/io.c:86:20: error: unused function 'omap1_map_common_io' [-Werror,-Wunused-function] arch/arm/mach-omap1/pm.h:113:2: error: "Power management for this processor not implemented yet" [-Werror,-W#warnings] Use the same trick as on OMAP2 and guard the actual compilation of platform code with another Makefile ifdef check based on an option that depends on having at least one SoC enabled. The io.c file still needs to get compiled to allow building device drivers with a dependency on CONFIG_ARCH_OMAP1. Acked-by: Tony Lindgren Signed-off-by: Arnd Bergmann --- include/linux/soc/ti/omap1-io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/ti/omap1-io.h b/include/linux/soc/ti/omap1-io.h index 9332c92690f4..f7f12728d4a6 100644 --- a/include/linux/soc/ti/omap1-io.h +++ b/include/linux/soc/ti/omap1-io.h @@ -5,7 +5,7 @@ #ifndef __ASSEMBLER__ #include -#if defined(CONFIG_ARCH_OMAP) && defined(CONFIG_ARCH_OMAP1) +#ifdef CONFIG_ARCH_OMAP1_ANY /* * NOTE: Please use ioremap + __raw_read/write where possible instead of these */ @@ -15,7 +15,7 @@ extern u32 omap_readl(u32 pa); extern void omap_writeb(u8 v, u32 pa); extern void omap_writew(u16 v, u32 pa); extern void omap_writel(u32 v, u32 pa); -#elif defined(CONFIG_COMPILE_TEST) +#else static inline u8 omap_readb(u32 pa) { return 0; } static inline u16 omap_readw(u32 pa) { return 0; } static inline u32 omap_readl(u32 pa) { return 0; } -- cgit v1.2.3 From 78ed93d72ded679e3caf0758357209887bda885f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Apr 2022 13:12:04 +0200 Subject: signal: Deliver SIGTRAP on perf event asynchronously if blocked With SIGTRAP on perf events, we have encountered termination of processes due to user space attempting to block delivery of SIGTRAP. Consider this case: ... sigset_t s; sigemptyset(&s); sigaddset(&s, SIGTRAP | ); sigprocmask(SIG_BLOCK, &s, ...); ... When the perf event triggers, while SIGTRAP is blocked, force_sig_perf() will force the signal, but revert back to the default handler, thus terminating the task. This makes sense for error conditions, but not so much for explicitly requested monitoring. However, the expectation is still that signals generated by perf events are synchronous, which will no longer be the case if the signal is blocked and delivered later. To give user space the ability to clearly distinguish synchronous from asynchronous signals, introduce siginfo_t::si_perf_flags and TRAP_PERF_FLAG_ASYNC (opted for flags in case more binary information is required in future). The resolution to the problem is then to (a) no longer force the signal (avoiding the terminations), but (b) tell user space via si_perf_flags if the signal was synchronous or not, so that such signals can be handled differently (e.g. let user space decide to ignore or consider the data imprecise). The alternative of making the kernel ignore SIGTRAP on perf events if the signal is blocked may work for some usecases, but likely causes issues in others that then have to revert back to interception of sigprocmask() (which we want to avoid). [ A concrete example: when using breakpoint perf events to track data-flow, in a region of code where signals are blocked, data-flow can no longer be tracked accurately. When a relevant asynchronous signal is received after unblocking the signal, the data-flow tracking logic needs to know its state is imprecise. ] Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Tested-by: Dmitry Vyukov Link: https://lore.kernel.org/r/20220404111204.935357-1-elver@google.com --- include/linux/compat.h | 1 + include/linux/sched/signal.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..01fddf72a81f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -235,6 +235,7 @@ typedef struct compat_siginfo { struct { compat_ulong_t _data; u32 _type; + u32 _flags; } _perf; }; } _sigfault; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..bab7cc56b13a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -320,7 +320,7 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); -int force_sig_perf(void __user *addr, u32 type, u64 sig_data); +int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); -- cgit v1.2.3 From 03f16cd020eb8bb2eb837e2090086f296a9fa91d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:36 -0700 Subject: objtool: Add CONFIG_OBJTOOL Now that stack validation is an optional feature of objtool, add CONFIG_OBJTOOL and replace most usages of CONFIG_STACK_VALIDATION with it. CONFIG_STACK_VALIDATION can now be considered to be frame-pointer specific. CONFIG_UNWINDER_ORC is already inherently valid for live patching, so no need to "validate" it. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/939bf3d85604b2a126412bf11af6e3bd3b872bcb.1650300597.git.jpoimboe@redhat.com --- include/linux/compiler.h | 6 +++--- include/linux/instrumentation.h | 6 +++--- include/linux/objtool.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 219aa5ddbc73..01ce94b58b42 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -109,7 +109,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* Unreachable code */ -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL /* * These macros help objtool understand GCC code flow for unreachable code. * The __COUNTER__ based labels are a hack to make each instance of the macros @@ -128,10 +128,10 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") -#else +#else /* !CONFIG_OBJTOOL */ #define annotate_unreachable() #define __annotate_jump_table -#endif +#endif /* CONFIG_OBJTOOL */ #ifndef unreachable # define unreachable() do { \ diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index 24359b4a9605..9111a3704072 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -2,7 +2,7 @@ #ifndef __LINUX_INSTRUMENTATION_H #define __LINUX_INSTRUMENTATION_H -#if defined(CONFIG_DEBUG_ENTRY) && defined(CONFIG_STACK_VALIDATION) +#ifdef CONFIG_VMLINUX_VALIDATION #include @@ -53,9 +53,9 @@ ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) -#else +#else /* !CONFIG_VMLINUX_VALIDATION */ # define instrumentation_begin() do { } while(0) # define instrumentation_end() do { } while(0) -#endif +#endif /* CONFIG_VMLINUX_VALIDATION */ #endif /* __LINUX_INSTRUMENTATION_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 586d35720f13..977d90ba642d 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -38,7 +38,7 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL #ifndef __ASSEMBLY__ @@ -157,7 +157,7 @@ struct unwind_hint { #endif /* __ASSEMBLY__ */ -#else /* !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_OBJTOOL */ #ifndef __ASSEMBLY__ @@ -179,6 +179,6 @@ struct unwind_hint { .endm #endif -#endif /* CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_OBJTOOL */ #endif /* _LINUX_OBJTOOL_H */ -- cgit v1.2.3 From 0f620cefd7753175b6258fed85f76c2014ec3799 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:41 -0700 Subject: objtool: Rename "VMLINUX_VALIDATION" -> "NOINSTR_VALIDATION" CONFIG_VMLINUX_VALIDATION is just the validation of the "noinstr" rules. That name is a misnomer, because now objtool actually does vmlinux validation for other reasons. Rename CONFIG_VMLINUX_VALIDATION to CONFIG_NOINSTR_VALIDATION. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/173f07e2d6d1afc0874aed975a61783207c6a531.1650300597.git.jpoimboe@redhat.com --- include/linux/instrumentation.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index 9111a3704072..bc7babe91b2e 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -2,7 +2,7 @@ #ifndef __LINUX_INSTRUMENTATION_H #define __LINUX_INSTRUMENTATION_H -#ifdef CONFIG_VMLINUX_VALIDATION +#ifdef CONFIG_NOINSTR_VALIDATION #include @@ -53,9 +53,9 @@ ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) -#else /* !CONFIG_VMLINUX_VALIDATION */ +#else /* !CONFIG_NOINSTR_VALIDATION */ # define instrumentation_begin() do { } while(0) # define instrumentation_end() do { } while(0) -#endif /* CONFIG_VMLINUX_VALIDATION */ +#endif /* CONFIG_NOINSTR_VALIDATION */ #endif /* __LINUX_INSTRUMENTATION_H */ -- cgit v1.2.3 From 89e9c7280075f6733b22dd0740daeddeb1256ebf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 20 Apr 2022 10:58:50 +0900 Subject: ipv6: Remove __ipv6_only_sock(). Since commit 9fe516ba3fb2 ("inet: move ipv6only in sock_common"), ipv6_only_sock() and __ipv6_only_sock() are the same macro. Let's remove the one. Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 918bfea4ef5f..ec5ca392eaa3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -340,8 +340,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return (struct raw6_sock *)sk; } -#define __ipv6_only_sock(sk) (sk->sk_ipv6only) -#define ipv6_only_sock(sk) (__ipv6_only_sock(sk)) +#define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) @@ -358,7 +357,6 @@ static inline int inet_v6_ipv6only(const struct sock *sk) return ipv6_only_sock(sk); } #else -#define __ipv6_only_sock(sk) 0 #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 -- cgit v1.2.3 From b804923b7ccb9c9629703364e927b48cd02a9254 Mon Sep 17 00:00:00 2001 From: "jason-jh.lin" Date: Tue, 19 Apr 2022 17:41:36 +0800 Subject: soc: mediatek: add mtk-mmsys support for mt8195 vdosys0 1. Add mt8195 mmsys compatible for 2 vdosys. 2. Add io_start into each driver data of mt8195 vdosys. 3. Add get match data function to identify mmsys by io_start. 4. Add mt8195 routing table settings of vdosys0. Signed-off-by: jason-jh.lin Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rex-BC Chen Reviewed-by: CK Hu Link: https://lore.kernel.org/r/20220419094143.9561-2-jason-jh.lin@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 4bba275e235a..cff5c9adbf46 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -17,13 +17,24 @@ enum mtk_ddp_comp_id { DDP_COMPONENT_COLOR0, DDP_COMPONENT_COLOR1, DDP_COMPONENT_DITHER, + DDP_COMPONENT_DITHER1, + DDP_COMPONENT_DP_INTF0, + DDP_COMPONENT_DP_INTF1, DDP_COMPONENT_DPI0, DDP_COMPONENT_DPI1, + DDP_COMPONENT_DSC0, + DDP_COMPONENT_DSC1, DDP_COMPONENT_DSI0, DDP_COMPONENT_DSI1, DDP_COMPONENT_DSI2, DDP_COMPONENT_DSI3, DDP_COMPONENT_GAMMA, + DDP_COMPONENT_MERGE0, + DDP_COMPONENT_MERGE1, + DDP_COMPONENT_MERGE2, + DDP_COMPONENT_MERGE3, + DDP_COMPONENT_MERGE4, + DDP_COMPONENT_MERGE5, DDP_COMPONENT_OD0, DDP_COMPONENT_OD1, DDP_COMPONENT_OVL0, -- cgit v1.2.3 From 4e8988c634a1159e803bfdc0118578ded97e012c Mon Sep 17 00:00:00 2001 From: "jason-jh.lin" Date: Tue, 19 Apr 2022 17:41:41 +0800 Subject: soc: mediatek: add DDP_DOMPONENT_DITHER0 enum for mt8195 vdosys0 The mmsys routing table of mt8195 vdosys0 has 2 DITHER components, so mmsys need to add DDP_COMPONENT_DITHER1 and change all usages of DITHER enum form DDP_COMPONENT_DITHER to DDP_COMPONENT_DITHER0. But its header need to keep DDP_COMPONENT_DITHER enum until drm/mediatek also changed it. Signed-off-by: jason-jh.lin Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Rex-BC Chen Link: https://lore.kernel.org/r/20220419094143.9561-7-jason-jh.lin@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index cff5c9adbf46..59117d970daf 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -17,6 +17,7 @@ enum mtk_ddp_comp_id { DDP_COMPONENT_COLOR0, DDP_COMPONENT_COLOR1, DDP_COMPONENT_DITHER, + DDP_COMPONENT_DITHER0 = DDP_COMPONENT_DITHER, DDP_COMPONENT_DITHER1, DDP_COMPONENT_DP_INTF0, DDP_COMPONENT_DP_INTF1, -- cgit v1.2.3 From bd40cbb0e3b37a4d2a2d9e2ac40122cdf619b1f3 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 19 Apr 2022 19:29:16 +0200 Subject: PM: domains: Move genpd's time-accounting to ktime_get_mono_fast_ns() To move towards a more consistent behaviour between genpd and the runtime PM core, let's start by converting genpd's time-accounting from ktime_get() into ktime_get_mono_fast_ns(). Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 67017c9390c8..043d48e4420a 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -98,7 +98,7 @@ struct genpd_power_state { u64 usage; u64 rejected; struct fwnode_handle *fwnode; - ktime_t idle_time; + u64 idle_time; void *data; }; @@ -149,8 +149,8 @@ struct generic_pm_domain { unsigned int state_count); unsigned int state_count; /* number of states */ unsigned int state_idx; /* state that genpd will go to when off */ - ktime_t on_time; - ktime_t accounting_time; + u64 on_time; + u64 accounting_time; const struct genpd_lock_ops *lock_ops; union { struct mutex mlock; -- cgit v1.2.3 From 6c2f421174273de8f83cde4286d1c076d43a2d35 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:24 +0200 Subject: driver: platform: Add helper for safer setting of driver_override Several core drivers and buses expect that driver_override is a dynamically allocated memory thus later they can kfree() it. However such assumption is not documented, there were in the past and there are already users setting it to a string literal. This leads to kfree() of static memory during device release (e.g. in error paths or during unbind): kernel BUG at ../mm/slub.c:3960! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM ... (kfree) from [] (platform_device_release+0x88/0xb4) (platform_device_release) from [] (device_release+0x2c/0x90) (device_release) from [] (kobject_put+0xec/0x20c) (kobject_put) from [] (exynos5_clk_probe+0x154/0x18c) (exynos5_clk_probe) from [] (platform_drv_probe+0x6c/0xa4) (platform_drv_probe) from [] (really_probe+0x280/0x414) (really_probe) from [] (driver_probe_device+0x78/0x1c4) (driver_probe_device) from [] (bus_for_each_drv+0x74/0xb8) (bus_for_each_drv) from [] (__device_attach+0xd4/0x16c) (__device_attach) from [] (bus_probe_device+0x88/0x90) (bus_probe_device) from [] (device_add+0x3dc/0x62c) (device_add) from [] (of_platform_device_create_pdata+0x94/0xbc) (of_platform_device_create_pdata) from [] (of_platform_bus_create+0x1a8/0x4fc) (of_platform_bus_create) from [] (of_platform_bus_create+0x20c/0x4fc) (of_platform_bus_create) from [] (of_platform_populate+0x84/0x118) (of_platform_populate) from [] (of_platform_default_populate_init+0xa0/0xb8) (of_platform_default_populate_init) from [] (do_one_initcall+0x8c/0x404) Provide a helper which clearly documents the usage of driver_override. This will allow later to reuse the helper and reduce the amount of duplicated code. Convert the platform driver to use a new helper and make the driver_override field const char (it is not modified by the core). Reviewed-by: Rafael J. Wysocki Acked-by: Rafael J. Wysocki Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-2-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 2 ++ include/linux/platform_device.h | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 15e7c5e15d62..700453017e1c 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -151,6 +151,8 @@ extern int __must_check driver_create_file(struct device_driver *driver, extern void driver_remove_file(struct device_driver *driver, const struct driver_attribute *attr); +int driver_set_override(struct device *dev, const char **override, + const char *s, size_t len); extern int __must_check driver_for_each_device(struct device_driver *drv, struct device *start, void *data, diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7c96f169d274..582d83ed9a91 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -31,7 +31,11 @@ struct platform_device { struct resource *resource; const struct platform_device_id *id_entry; - char *driver_override; /* Driver name to force a match */ + /* + * Driver name to force a match. Do not set directly, because core + * frees it. Use driver_set_override() to set or clear it. + */ + const char *driver_override; /* MFD cell pointer */ struct mfd_cell *mfd_cell; -- cgit v1.2.3 From 6e67955087e72f1a5f64dd8014c27e1d9317fbb4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:25 +0200 Subject: amba: Use driver_set_override() instead of open-coding Use a helper to set driver_override to reduce the amount of duplicated code. Make the driver_override field const char, because it is not modified by the core and it matches other subsystems. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-3-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/amba/bus.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 6562f543c3e0..93799a72ff82 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -70,7 +70,11 @@ struct amba_device { unsigned int cid; struct amba_cs_uci_id uci; unsigned int irq[AMBA_NR_IRQS]; - char *driver_override; + /* + * Driver name to force a match. Do not set directly, because core + * frees it. Use driver_set_override() to set or clear it. + */ + const char *driver_override; }; struct amba_driver { -- cgit v1.2.3 From 5688f212e98a2469583a067fa5da4312ddc4e357 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:26 +0200 Subject: fsl-mc: Use driver_set_override() instead of open-coding Use a helper to set driver_override to reduce the amount of duplicated code. Make the driver_override field const char, because it is not modified by the core and it matches other subsystems. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-4-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl/mc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 7b6c42bfb660..7a87ab9eba99 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -170,7 +170,9 @@ struct fsl_mc_obj_desc { * @regions: pointer to array of MMIO region entries * @irqs: pointer to array of pointers to interrupts allocated to this device * @resource: generic resource associated with this MC object device, if any. - * @driver_override: driver name to force a match + * @driver_override: driver name to force a match; do not set directly, + * because core frees it; use driver_set_override() to + * set or clear it. * * Generic device object for MC object devices that are "attached" to a * MC bus. @@ -204,7 +206,7 @@ struct fsl_mc_device { struct fsl_mc_device_irq **irqs; struct fsl_mc_resource *resource; struct device_link *consumer_link; - char *driver_override; + const char *driver_override; }; #define to_fsl_mc_device(_dev) \ -- cgit v1.2.3 From 01ed100276bdac259f1b418b998904a7486b0b68 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:27 +0200 Subject: hv: Use driver_set_override() instead of open-coding Use a helper to set driver_override to the reduce amount of duplicated code. Make the driver_override field const char, because it is not modified by the core and it matches other subsystems. Reviewed-by: Michael Kelley Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-5-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index fe2e0179ed51..12e2336b23b7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1257,7 +1257,11 @@ struct hv_device { u16 device_id; struct device device; - char *driver_override; /* Driver name to force a match */ + /* + * Driver name to force a match. Do not set directly, because core + * frees it. Use driver_set_override() to set or clear it. + */ + const char *driver_override; struct vmbus_channel *channel; struct kset *channels_kset; -- cgit v1.2.3 From 23d99baf9d729ca30b2fb6798a7b403a37bfb800 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:28 +0200 Subject: PCI: Use driver_set_override() instead of open-coding Use a helper to set driver_override to the reduce amount of duplicated code. Make the driver_override field const char, because it is not modified by the core and it matches other subsystems. Reviewed-by: Andy Shevchenko Acked-by: Bjorn Helgaas Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-6-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/pci.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60adf42460ab..844d38f589cf 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -516,7 +516,11 @@ struct pci_dev { u16 acs_cap; /* ACS Capability offset */ phys_addr_t rom; /* Physical address if not from BAR */ size_t romlen; /* Length if not from BAR */ - char *driver_override; /* Driver name to force a match */ + /* + * Driver name to force a match. Do not set directly, because core + * frees it. Use driver_set_override() to set or clear it. + */ + const char *driver_override; unsigned long priv_flags; /* Private flags for the PCI driver */ -- cgit v1.2.3 From 19368f0f23e80929691dd5b1354832c0e0494419 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:30 +0200 Subject: spi: Use helper for safer setting of driver_override Use a helper to set driver_override to the reduce amount of duplicated code. Reviewed-by: Mark Brown Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-8-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 5f8c063ddff4..f0177f9b6e13 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -138,6 +138,8 @@ extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); * for driver coldplugging, and in uevents used for hotplugging * @driver_override: If the name of a driver is written to this attribute, then * the device will bind to the named driver and only the named driver. + * Do not set directly, because core frees it; use driver_set_override() to + * set or clear it. * @cs_gpiod: gpio descriptor of the chipselect line (optional, NULL when * not using a GPIO line) * @word_delay: delay to be inserted between consecutive -- cgit v1.2.3 From 240bf4e665741241983580812a2be1b033f120ee Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:31 +0200 Subject: vdpa: Use helper for safer setting of driver_override Use a helper to set driver_override to the reduce amount of duplicated code. Acked-by: Michael S. Tsirkin Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-9-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/vdpa.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8943a209202e..c0a5083632ab 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -64,7 +64,9 @@ struct vdpa_mgmt_dev; * struct vdpa_device - representation of a vDPA device * @dev: underlying device * @dma_dev: the actual device that is performing DMA - * @driver_override: driver name to force a match + * @driver_override: driver name to force a match; do not set directly, + * because core frees it; use driver_set_override() to + * set or clear it. * @config: the configuration ops for this device. * @cf_mutex: Protects get and set access to configuration layout. * @index: device index -- cgit v1.2.3 From 42cd402b8fd4672b692400fe5f9eecd55d2794ac Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 19 Apr 2022 13:34:35 +0200 Subject: rpmsg: Fix kfree() of static memory on setting driver_override The driver_override field from platform driver should not be initialized from static memory (string literal) because the core later kfree() it, for example when driver_override is set via sysfs. Use dedicated helper to set driver_override properly. Fixes: 950a7388f02b ("rpmsg: Turn name service into a stand alone driver") Fixes: c0cdc19f84a4 ("rpmsg: Driver for user space endpoint interface") Reviewed-by: Bjorn Andersson Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220419113435.246203-13-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/rpmsg.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 02fa9116cd60..20c8cd1cde21 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -41,7 +41,9 @@ struct rpmsg_channel_info { * rpmsg_device - device that belong to the rpmsg bus * @dev: the device struct * @id: device id (used to match between rpmsg drivers and devices) - * @driver_override: driver name to force a match + * @driver_override: driver name to force a match; do not set directly, + * because core frees it; use driver_set_override() to + * set or clear it. * @src: local address * @dst: destination address * @ept: the rpmsg endpoint of this channel @@ -51,7 +53,7 @@ struct rpmsg_channel_info { struct rpmsg_device { struct device dev; struct rpmsg_device_id id; - char *driver_override; + const char *driver_override; u32 src; u32 dst; struct rpmsg_endpoint *ept; -- cgit v1.2.3 From f918cfc08c1755b9e54cd6effc923fa809045cf4 Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Wed, 6 Apr 2022 03:55:23 -0700 Subject: firmware: xilinx: add support for IOCTL and QUERY ID feature check Add support to check if IOCTL ID or QUERY ID is supported in firmware or not. Signed-off-by: Ronak Jain Link: https://lore.kernel.org/r/1649242526-17493-2-git-send-email-ronak.jain@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 14f00a7672d1..1ec73d5352c3 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -29,6 +29,11 @@ /* SMC SIP service Call Function Identifier Prefix */ #define PM_SIP_SVC 0xC2000000 + +/* PM API versions */ +#define PM_API_VERSION_2 2 + +/* ATF only commands */ #define PM_GET_TRUSTZONE_VERSION 0xa03 #define PM_SET_SUSPEND_MODE 0xa02 #define GET_CALLBACK_DATA 0xa01 @@ -460,6 +465,7 @@ int zynqmp_pm_load_pdi(const u32 src, const u64 address); int zynqmp_pm_register_notifier(const u32 node, const u32 event, const u32 wake, const u32 enable); int zynqmp_pm_feature(const u32 api_id); +int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); #else @@ -678,6 +684,11 @@ static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id) return -ENODEV; } +static inline int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id) +{ + return -ENODEV; +} + static inline int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id) { return -ENODEV; -- cgit v1.2.3 From faebd693c59387b7b765fab64b543855e15a91b4 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:36 +0206 Subject: printk: rename cpulock functions Since the printk cpulock is CPU-reentrant and since it is used in all contexts, its usage must be carefully considered and most likely will require programming locklessly. To avoid mistaking the printk cpulock as a typical lock, rename it to cpu_sync. The main functions then become: printk_cpu_sync_get_irqsave(flags); printk_cpu_sync_put_irqrestore(flags); Add extra notes of caution in the function description to help developers understand the requirements for correct usage. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-2-john.ogness@linutronix.de --- include/linux/printk.h | 54 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 1522df223c0f..859323a52985 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -277,43 +277,55 @@ static inline void printk_trigger_flush(void) #endif #ifdef CONFIG_SMP -extern int __printk_cpu_trylock(void); -extern void __printk_wait_on_cpu_lock(void); -extern void __printk_cpu_unlock(void); +extern int __printk_cpu_sync_try_get(void); +extern void __printk_cpu_sync_wait(void); +extern void __printk_cpu_sync_put(void); /** - * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * printk_cpu_sync_get_irqsave() - Acquire the printk cpu-reentrant spinning + * lock and disable interrupts. * @flags: Stack-allocated storage for saving local interrupt state, - * to be passed to printk_cpu_unlock_irqrestore(). + * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. + * + * CAUTION: This function must be used carefully. It does not behave like a + * typical lock. Here are important things to watch out for... + * + * * This function is reentrant on the same CPU. Therefore the calling + * code must not assume exclusive access to data if code accessing the + * data can run reentrant or within NMI context on the same CPU. + * + * * If there exists usage of this function from NMI context, it becomes + * unsafe to perform any type of locking or spinning to wait for other + * CPUs after calling this function from any context. This includes + * using spinlocks or any other busy-waiting synchronization methods. */ -#define printk_cpu_lock_irqsave(flags) \ - for (;;) { \ - local_irq_save(flags); \ - if (__printk_cpu_trylock()) \ - break; \ - local_irq_restore(flags); \ - __printk_wait_on_cpu_lock(); \ +#define printk_cpu_sync_get_irqsave(flags) \ + for (;;) { \ + local_irq_save(flags); \ + if (__printk_cpu_sync_try_get()) \ + break; \ + local_irq_restore(flags); \ + __printk_cpu_sync_wait(); \ } /** - * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning - * lock and restore interrupts. - * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). + * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning + * lock and restore interrupts. + * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ -#define printk_cpu_unlock_irqrestore(flags) \ +#define printk_cpu_sync_put_irqrestore(flags) \ do { \ - __printk_cpu_unlock(); \ + __printk_cpu_sync_put(); \ local_irq_restore(flags); \ - } while (0) \ + } while (0) #else -#define printk_cpu_lock_irqsave(flags) ((void)flags) -#define printk_cpu_unlock_irqrestore(flags) ((void)flags) +#define printk_cpu_sync_get_irqsave(flags) ((void)flags) +#define printk_cpu_sync_put_irqrestore(flags) ((void)flags) #endif /* CONFIG_SMP */ -- cgit v1.2.3 From f5343321b71ac0a1112adeab0ff90b239bad3a83 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:37 +0206 Subject: printk: cpu sync always disable interrupts The CPU sync functions are a NOP for !CONFIG_SMP. But for !CONFIG_SMP they still need to disable interrupts in order to preserve context within the CPU sync sections. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-3-john.ogness@linutronix.de --- include/linux/printk.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 859323a52985..b70a42f94031 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -281,9 +281,16 @@ extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); extern void __printk_cpu_sync_put(void); +#else + +#define __printk_cpu_sync_try_get() true +#define __printk_cpu_sync_wait() +#define __printk_cpu_sync_put() +#endif /* CONFIG_SMP */ + /** - * printk_cpu_sync_get_irqsave() - Acquire the printk cpu-reentrant spinning - * lock and disable interrupts. + * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk + * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, * to be passed to printk_cpu_sync_put_irqrestore(). * @@ -322,13 +329,6 @@ extern void __printk_cpu_sync_put(void); local_irq_restore(flags); \ } while (0) -#else - -#define printk_cpu_sync_get_irqsave(flags) ((void)flags) -#define printk_cpu_sync_put_irqrestore(flags) ((void)flags) - -#endif /* CONFIG_SMP */ - extern int kptr_restrict; /** -- cgit v1.2.3 From a699449bb13b70b8bd10dc03ad7327ea3993221e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:44 +0206 Subject: printk: refactor and rework printing logic Refactor/rework printing logic in order to prepare for moving to threaded console printing. - Move @console_seq into struct console so that the current "position" of each console can be tracked individually. - Move @console_dropped into struct console so that the current drop count of each console can be tracked individually. - Modify printing logic so that each console independently loads, prepares, and prints its next record. - Remove exclusive_console logic. Since console positions are handled independently, replaying past records occurs naturally. - Update the comments explaining why preemption is disabled while printing from printk() context. With these changes, there is a change in behavior: the console replaying the log (formerly exclusive console) will no longer block other consoles. New messages appear on the other consoles while the newly added console is still replaying. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-10-john.ogness@linutronix.de --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 7cd758a4f44e..8c1686e2c233 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -151,6 +151,8 @@ struct console { int cflag; uint ispeed; uint ospeed; + u64 seq; + unsigned long dropped; void *data; struct console *next; }; -- cgit v1.2.3 From 3b604ca81202eea2a917eb6491e90f610fba0ec7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:46 +0206 Subject: printk: add pr_flush() Provide a might-sleep function to allow waiting for console printers to catch up to the latest logged message. Use pr_flush() whenever it is desirable to get buffered messages printed before continuing: suspend_console(), resume_console(), console_stop(), console_start(), console_unblank(). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-12-john.ogness@linutronix.de --- include/linux/printk.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index b70a42f94031..091fba7283e1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,8 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit +extern bool pr_flush(int timeout_ms, bool reset_on_progress); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -220,6 +222,11 @@ static inline void printk_deferred_exit(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + static inline int printk_ratelimit(void) { return 0; -- cgit v1.2.3 From 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:47 +0206 Subject: printk: add functions to prefer direct printing Once kthread printing is available, console printing will no longer occur in the context of the printk caller. However, there are some special contexts where it is desirable for the printk caller to directly print out kernel messages. Using pr_flush() to wait for threaded printers is only possible if the caller is in a sleepable context and the kthreads are active. That is not always the case. Introduce printk_prefer_direct_enter() and printk_prefer_direct_exit() functions to explicitly (and globally) activate/deactivate preferred direct console printing. The term "direct console printing" refers to printing to all enabled consoles from the context of the printk caller. The term "prefer" is used because this type of printing is only best effort. If the console is currently locked or other printers are already actively printing, the printk caller will need to rely on the other contexts to handle the printing. This preferred direct printing is how all printing has been handled until now (unless it was explicitly deferred). When kthread printing is introduced, there may be some unanticipated problems due to kthreads being unable to flush important messages. In order to minimize such risks, preferred direct printing is activated for the primary important messages when the system experiences general types of major errors. These are: - emergency reboot/shutdown - cpu and rcu stalls - hard and soft lockups - hung tasks - warn - sysrq Note that since kthread printing does not yet exist, no behavior changes result from this commit. This is only implementing the counter and marking the various places where preferred direct printing is active. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Paul E. McKenney # for RCU Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-13-john.ogness@linutronix.de --- include/linux/printk.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 091fba7283e1..cd26aab0ab2a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,9 @@ extern void __printk_safe_exit(void); #define printk_deferred_enter __printk_safe_enter #define printk_deferred_exit __printk_safe_exit +extern void printk_prefer_direct_enter(void); +extern void printk_prefer_direct_exit(void); + extern bool pr_flush(int timeout_ms, bool reset_on_progress); /* @@ -222,6 +225,14 @@ static inline void printk_deferred_exit(void) { } +static inline void printk_prefer_direct_enter(void) +{ +} + +static inline void printk_prefer_direct_exit(void) +{ +} + static inline bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; -- cgit v1.2.3 From 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 21 Apr 2022 23:28:48 +0206 Subject: printk: add kthread console printers Create a kthread for each console to perform console printing. During normal operation (@system_state == SYSTEM_RUNNING), the kthread printers are responsible for all printing on their respective consoles. During non-normal operation, console printing is done as it has been: within the context of the printk caller or within irqwork triggered by the printk caller, referred to as direct printing. Since threaded console printers are responsible for all printing during normal operation, this also includes messages generated via deferred printk calls. If direct printing is in effect during a deferred printk call, the queued irqwork will perform the direct printing. To make it clear that this is the only time that the irqwork will perform direct printing, rename the flag PRINTK_PENDING_OUTPUT to PRINTK_PENDING_DIRECT_OUTPUT. Threaded console printers synchronize against each other and against console lockers by taking the console lock for each message that is printed. Note that the kthread printers do not care about direct printing. They will always try to print if new records are available. They can be blocked by direct printing, but will be woken again once direct printing is finished. Console unregistration is a bit tricky because the associated kthread printer cannot be stopped while the console lock is held. A policy is implemented that states: whichever task clears con->thread (under the console lock) is responsible for stopping the kthread. unregister_console() will clear con->thread while the console lock is held and then stop the kthread after releasing the console lock. For consoles that have implemented the exit() callback, the kthread is stopped before exit() is called. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220421212250.565456-14-john.ogness@linutronix.de --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 8c1686e2c233..9a251e70c090 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,6 +153,8 @@ struct console { uint ospeed; u64 seq; unsigned long dropped; + struct task_struct *thread; + void *data; struct console *next; }; -- cgit v1.2.3 From 5e7260712b9a76de19c32f15d667f8f93e573978 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 21 Apr 2022 14:47:26 +0200 Subject: qed: Remove IP services API. qed_nvmetcp_ip_services.c and its corresponding header file were introduced in commit 806ee7f81a2b ("qed: Add IP services APIs support") but there's still no users for any of the functions they declare. Since these files are effectively unused, let's just drop them. Found by code inspection. Compile-tested only. Signed-off-by: Guillaume Nault Link: https://lore.kernel.org/r/351ac8c847980e22850eb390553f8cc0e1ccd0ce.1650545051.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_nvmetcp_ip_services_if.h | 29 -------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/qed/qed_nvmetcp_ip_services_if.h (limited to 'include/linux') diff --git a/include/linux/qed/qed_nvmetcp_ip_services_if.h b/include/linux/qed/qed_nvmetcp_ip_services_if.h deleted file mode 100644 index 3604aee53796..000000000000 --- a/include/linux/qed/qed_nvmetcp_ip_services_if.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ -/* - * Copyright 2021 Marvell. All rights reserved. - */ - -#ifndef _QED_IP_SERVICES_IF_H -#define _QED_IP_SERVICES_IF_H - -#include -#include -#include -#include - -int qed_route_ipv4(struct sockaddr_storage *local_addr, - struct sockaddr_storage *remote_addr, - struct sockaddr *hardware_address, - struct net_device **ndev); -int qed_route_ipv6(struct sockaddr_storage *local_addr, - struct sockaddr_storage *remote_addr, - struct sockaddr *hardware_address, - struct net_device **ndev); -void qed_vlan_get_ndev(struct net_device **ndev, u16 *vlan_id); -struct pci_dev *qed_validate_ndev(struct net_device *ndev); -void qed_return_tcp_port(struct socket *sock); -int qed_fetch_tcp_port(struct sockaddr_storage local_ip_addr, - struct socket **sock, u16 *port); -__be16 qed_get_in_port(struct sockaddr_storage *sa); - -#endif /* _QED_IP_SERVICES_IF_H */ -- cgit v1.2.3 From 9ea4dcf49878bb9546b8fa9319dcbdc9b7ee20f8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Apr 2022 15:58:11 -0700 Subject: PM: CXL: Disable suspend The CXL specification claims S3 support at a hardware level, but at a system software level there are some missing pieces. Section 9.4 (CXL 2.0) rightly claims that "CXL mem adapters may need aux power to retain memory context across S3", but there is no enumeration mechanism for the OS to determine if a given adapter has that support. Moreover the save state and resume image for the system may inadvertantly end up in a CXL device that needs to be restored before the save state is recoverable. I.e. a circular dependency that is not resolvable without a third party save-area. Arrange for the cxl_mem driver to fail S3 attempts. This still nominaly allows for suspend, but requires unbinding all CXL memory devices before the suspend to ensure the typical DRAM flow is taken. The cxl_mem unbind flow is intended to also tear down all CXL memory regions associated with a given cxl_memdev. It is reasonable to assume that any device participating in a System RAM range published in the EFI memory map is covered by aux power and save-area outside the device itself. So this restriction can be minimized in the future once pre-existing region enumeration support arrives, and perhaps a spec update to clarify if the EFI memory map is sufficent for determining the range of devices managed by platform-firmware for S3 support. Per Rafael, if the CXL configuration prevents suspend then it should fail early before tasks are frozen, and mem_sleep should stop showing 'mem' as an option [1]. Effectively CXL augments the platform suspend ->valid() op since, for example, the ACPI ops are not aware of the CXL / PCI dependencies. Given the split role of platform firmware vs OS provisioned CXL memory it is up to the cxl_mem driver to determine if the CXL configuration has elements that platform firmware may not be prepared to restore. Link: https://lore.kernel.org/r/CAJZ5v0hGVN_=3iU8OLpHY3Ak35T5+JcBM-qs8SbojKrpd0VXsA@mail.gmail.com [1] Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/165066828317.3907920.5690432272182042556.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/pm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..7911c4c9a7be 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -36,6 +36,15 @@ static inline void pm_vt_switch_unregister(struct device *dev) } #endif /* CONFIG_VT_CONSOLE_SLEEP */ +#ifdef CONFIG_CXL_SUSPEND +bool cxl_mem_active(void); +#else +static inline bool cxl_mem_active(void) +{ + return false; +} +#endif + /* * Device power management */ -- cgit v1.2.3 From f226041424cf87245d39a1b2dfae304308b36b6b Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Sat, 9 Apr 2022 14:12:15 -0700 Subject: soc: ti: wkup_m3_ipc: Add support for toggling VTT regulator Some boards like the AM335x EVM-SK and AM437x GP EVM provide software control via a GPIO pin to toggle the DDR VTT regulator to reduce power consumption in low power states. The VTT regulator should be disabled after enabling self-refresh on suspend, and should be enabled before disabling self-refresh on resume. This is to allow proper self-refresh entry/exit commands to be transmitted to the memory. The "ti,vtt-gpio-pin" device tree property in the wkup_m3_ipc node specifies which GPIO pin to use. This property is communicated to the Wakeup Cortex M3 co-processor where the actual toggling of the GPIO pin happens in CM3 firmware [1]. Please note that the GPIO pin must be on the GPIO0 module as that module is in the wakeup power domain. [1] https://git.ti.com/cgit/processor-firmware/ti-amx3-cm3-pm-firmware/tree/src/pm_services/ddr.c?h=08.02.00.006#n190 Signed-off-by: Dave Gerlach Signed-off-by: Keerthy [dfustini: remove the unnecessary "ti,needs-vtt-toggle" property] Signed-off-by: Drew Fustini Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20220409211215.2529387-3-dfustini@baylibre.com --- include/linux/wkup_m3_ipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index 3f496967b538..2bc52c6381d5 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -33,6 +33,7 @@ struct wkup_m3_ipc { int mem_type; unsigned long resume_addr; + int vtt_conf; int state; struct completion sync_complete; -- cgit v1.2.3 From 0f08c2e7458e25c967d844170f8ad1aac3b57a02 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Mar 2022 12:55:06 +0900 Subject: usb: deprecate the third argument of usb_maxpacket() This is a transitional patch with the ultimate goal of changing the prototype of usb_maxpacket() from: | static inline __u16 | usb_maxpacket(struct usb_device *udev, int pipe, int is_out) into: | static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) The third argument of usb_maxpacket(): is_out gets removed because it can be derived from its second argument: pipe using usb_pipeout(pipe). Furthermore, in the current version, ubs_pipeout(pipe) is called regardless in order to sanitize the is_out parameter. In order to make a smooth change, we first deprecate the is_out parameter by simply ignoring it (using a variadic function) and will remove it later, once all the callers get updated. The body of the function is reworked accordingly and is_out is replaced by usb_pipeout(pipe). The WARN_ON() calls become unnecessary and get removed. Finally, the return type is changed from __u16 to u16 because this is not a UAPI function. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20220317035514.6378-2-mailhol.vincent@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 200b7b79acb5..572e136f6314 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1969,21 +1969,17 @@ usb_pipe_endpoint(struct usb_device *dev, unsigned int pipe) return eps[usb_pipeendpoint(pipe)]; } -/*-------------------------------------------------------------------------*/ - -static inline __u16 -usb_maxpacket(struct usb_device *udev, int pipe, int is_out) +static inline u16 usb_maxpacket(struct usb_device *udev, int pipe, + /* int is_out deprecated */ ...) { struct usb_host_endpoint *ep; unsigned epnum = usb_pipeendpoint(pipe); - if (is_out) { - WARN_ON(usb_pipein(pipe)); + if (usb_pipeout(pipe)) ep = udev->ep_out[epnum]; - } else { - WARN_ON(usb_pipeout(pipe)); + else ep = udev->ep_in[epnum]; - } + if (!ep) return 0; @@ -1991,8 +1987,6 @@ usb_maxpacket(struct usb_device *udev, int pipe, int is_out) return usb_endpoint_maxp(&ep->desc); } -/* ----------------------------------------------------------------------- */ - /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { -- cgit v1.2.3 From 2ddf7617d568fdcbd3cc454bb4849fa6d3398c86 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Mar 2022 12:55:13 +0900 Subject: usb: remove third argument of usb_maxpacket() Now that all users of usb_maxpacket() have been migrated to only use two arguments, remove the third variadic argument which was introduced for the transition. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20220317035514.6378-9-mailhol.vincent@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 572e136f6314..8127782aa7a1 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1969,8 +1969,7 @@ usb_pipe_endpoint(struct usb_device *dev, unsigned int pipe) return eps[usb_pipeendpoint(pipe)]; } -static inline u16 usb_maxpacket(struct usb_device *udev, int pipe, - /* int is_out deprecated */ ...) +static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) { struct usb_host_endpoint *ep; unsigned epnum = usb_pipeendpoint(pipe); -- cgit v1.2.3 From bdddc253b0938a0063798881d1f6a971ea1d8943 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Mar 2022 12:55:14 +0900 Subject: usb: rework usb_maxpacket() using usb_pipe_endpoint() Rework the body of usb_maxpacket() and just rely on the usb_pipe_endpoint() helper function to retrieve the host endpoint instead of doing it by hand. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20220317035514.6378-10-mailhol.vincent@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 8127782aa7a1..60bee864d897 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1971,13 +1971,7 @@ usb_pipe_endpoint(struct usb_device *dev, unsigned int pipe) static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) { - struct usb_host_endpoint *ep; - unsigned epnum = usb_pipeendpoint(pipe); - - if (usb_pipeout(pipe)) - ep = udev->ep_out[epnum]; - else - ep = udev->ep_in[epnum]; + struct usb_host_endpoint *ep = usb_pipe_endpoint(udev, pipe); if (!ep) return 0; -- cgit v1.2.3 From da214a475f8bd1d3e9e7a19ddfeb4d1617551bab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2022 14:22:39 -0600 Subject: net: add __sys_socket_file() This works like __sys_socket(), except instead of allocating and returning a socket fd, it just returns the file associated with the socket. No fd is installed into the process file table. This is similar to do_accept(), and allows io_uring to use this without instantiating a file descriptor in the process file table. Signed-off-by: Jens Axboe Acked-by: David S. Miller Link: https://lore.kernel.org/r/20220412202240.234207-2-axboe@kernel.dk --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 6f85f5d957ef..a1882e1e71d2 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -434,6 +434,7 @@ extern struct file *do_accept(struct file *file, unsigned file_flags, extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); +extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); -- cgit v1.2.3 From da32b5817253697671af961715517bfbb308a592 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sat, 23 Apr 2022 11:07:49 +0100 Subject: mm: Add fault_in_subpage_writeable() to probe at sub-page granularity On hardware with features like arm64 MTE or SPARC ADI, an access fault can be triggered at sub-page granularity. Depending on how the fault_in_writeable() function is used, the caller can get into a live-lock by continuously retrying the fault-in on an address different from the one where the uaccess failed. In the majority of cases progress is ensured by the following conditions: 1. copy_to_user_nofault() guarantees at least one byte access if the user address is not faulting. 2. The fault_in_writeable() loop is resumed from the first address that could not be accessed by copy_to_user_nofault(). If the loop iteration is restarted from an earlier (initial) point, the loop is repeated with the same conditions and it would live-lock. Introduce an arch-specific probe_subpage_writeable() and call it from the newly added fault_in_subpage_writeable() function. The arch code with sub-page faults will have to implement the specific probing functionality. Note that no other fault_in_subpage_*() functions are added since they have no callers currently susceptible to a live-lock. Signed-off-by: Catalin Marinas Cc: Andrew Morton Link: https://lore.kernel.org/r/20220423100751.1870771-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- include/linux/pagemap.h | 1 + include/linux/uaccess.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 993994cd943a..6165283bdb6f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1046,6 +1046,7 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); * Fault in userspace address range. */ size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_subpage_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 546179418ffa..5a328cf02b75 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -231,6 +231,28 @@ static inline bool pagefault_disabled(void) */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) +#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS + +/** + * probe_subpage_writeable: probe the user range for write faults at sub-page + * granularity (e.g. arm64 MTE) + * @uaddr: start of address range + * @size: size of address range + * + * Returns 0 on success, the number of bytes not probed on fault. + * + * It is expected that the caller checked for the write permission of each + * page in the range either by put_user() or GUP. The architecture port can + * implement a more efficient get_user() probing if the same sub-page faults + * are triggered by either a read or a write. + */ +static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) +{ + return 0; +} + +#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long -- cgit v1.2.3 From 3c938cc5cebcbd2291fe97f523c0705a2c24c77d Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Tue, 19 Apr 2022 09:28:10 +0800 Subject: gpio: use raw spinlock for gpio chip shadowed data In case of PREEMPT_RT, there is a raw_spinlock -> spinlock dependency as the lockdep report shows. __irq_set_handler irq_get_desc_buslock __irq_get_desc_lock raw_spin_lock_irqsave(&desc->lock, *flags); // raw spinlock get here __irq_do_set_handler mask_ack_irq dwapb_irq_ack spin_lock_irqsave(&gc->bgpio_lock, flags); // sleep able spinlock irq_put_desc_busunlock Replace with a raw lock to avoid BUGs. This lock is only used to access registers, and It's safe to replace with the raw lock without bad influence. [ 15.090359][ T1] ============================= [ 15.090365][ T1] [ BUG: Invalid wait context ] [ 15.090373][ T1] 5.10.59-rt52-00983-g186a6841c682-dirty #3 Not tainted [ 15.090386][ T1] ----------------------------- [ 15.090392][ T1] swapper/0/1 is trying to lock: [ 15.090402][ T1] 70ff00018507c188 (&gc->bgpio_lock){....}-{3:3}, at: _raw_spin_lock_irqsave+0x1c/0x28 [ 15.090470][ T1] other info that might help us debug this: [ 15.090477][ T1] context-{5:5} [ 15.090485][ T1] 3 locks held by swapper/0/1: [ 15.090497][ T1] #0: c2ff0001816de1a0 (&dev->mutex){....}-{4:4}, at: __device_driver_lock+0x98/0x104 [ 15.090553][ T1] #1: ffff90001485b4b8 (irq_domain_mutex){+.+.}-{4:4}, at: irq_domain_associate+0xbc/0x6d4 [ 15.090606][ T1] #2: 4bff000185d7a8e0 (lock_class){....}-{2:2}, at: _raw_spin_lock_irqsave+0x1c/0x28 [ 15.090654][ T1] stack backtrace: [ 15.090661][ T1] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 5.10.59-rt52-00983-g186a6841c682-dirty #3 [ 15.090682][ T1] Hardware name: Horizon Robotics Journey 5 DVB (DT) [ 15.090692][ T1] Call trace: ...... [ 15.090811][ T1] _raw_spin_lock_irqsave+0x1c/0x28 [ 15.090828][ T1] dwapb_irq_ack+0xb4/0x300 [ 15.090846][ T1] __irq_do_set_handler+0x494/0xb2c [ 15.090864][ T1] __irq_set_handler+0x74/0x114 [ 15.090881][ T1] irq_set_chip_and_handler_name+0x44/0x58 [ 15.090900][ T1] gpiochip_irq_map+0x210/0x644 Signed-off-by: Schspa Shi Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Acked-by: Doug Berger Acked-by: Serge Semin Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 98c93510640e..991d374dcf71 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -436,7 +436,7 @@ struct gpio_chip { void __iomem *reg_dir_in; bool bgpio_dir_unreadable; int bgpio_bits; - spinlock_t bgpio_lock; + raw_spinlock_t bgpio_lock; unsigned long bgpio_data; unsigned long bgpio_dir; #endif /* CONFIG_GPIO_GENERIC */ -- cgit v1.2.3 From 38035c04f5865c4ef9597d6beed6a7178f90f64a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:13 +0300 Subject: inotify: move control flags from mask to mark flags The inotify control flags in the mark mask (e.g. FS_IN_ONE_SHOT) are not relevant to object interest mask, so move them to the mark flags. This frees up some bits in the object interest mask. Link: https://lore.kernel.org/r/20220422120327.3459282-3-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0805b74cae44..b1c72edd9784 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -55,7 +55,6 @@ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ -#define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -66,7 +65,6 @@ #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ -#define FS_IN_ONESHOT 0x80000000 /* only send event once */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) @@ -106,8 +104,7 @@ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ -#define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ - FS_DN_MULTISHOT | FS_EVENT_ON_CHILD) +#define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) @@ -473,9 +470,14 @@ struct fsnotify_mark { struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 -#define FSNOTIFY_MARK_FLAG_ALIVE 0x02 -#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 + /* General fsnotify mark flags */ +#define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 + /* inotify mark flags */ +#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 +#define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 + /* fanotify mark flags */ +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 unsigned int flags; /* flags [mark->lock] */ }; -- cgit v1.2.3 From 867a448d587e7fa845bceaf4ee1c632448f2a9fa Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:15 +0300 Subject: fsnotify: pass flags argument to fsnotify_alloc_group() Add flags argument to fsnotify_alloc_group(), define and use the flag FSNOTIFY_GROUP_USER in inotify and fanotify instead of the helper fsnotify_alloc_user_group() to indicate user allocation. Although the flag FSNOTIFY_GROUP_USER is currently not used after group allocation, we store the flags argument in the group struct for future use of other group flags. Link: https://lore.kernel.org/r/20220422120327.3459282-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b1c72edd9784..f0bf557af009 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -210,6 +210,9 @@ struct fsnotify_group { unsigned int priority; bool shutdown; /* group is being shut down, don't queue more events */ +#define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ + int flags; + /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user @@ -543,8 +546,9 @@ static inline void fsnotify_update_flags(struct dentry *dentry) /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ -extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops); -extern struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops); +extern struct fsnotify_group *fsnotify_alloc_group( + const struct fsnotify_ops *ops, + int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ -- cgit v1.2.3 From f3010343d9e119da35ee864b3a28993bb5c78ed7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:16 +0300 Subject: fsnotify: make allow_dups a property of the group Instead of passing the allow_dups argument to fsnotify_add_mark() as an argument, define the group flag FSNOTIFY_GROUP_DUPS to express the allow_dups behavior and set this behavior at group creation time for all calls of fsnotify_add_mark(). Rename the allow_dups argument to generic add_flags argument for future use. Link: https://lore.kernel.org/r/20220422120327.3459282-6-amir73il@gmail.com Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f0bf557af009..dd440e6ff528 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -211,6 +211,7 @@ struct fsnotify_group { bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ +#define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ @@ -641,26 +642,26 @@ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int allow_dups, __kernel_fsid_t *fsid); + int add_flags, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int allow_dups, + unsigned int obj_type, int add_flags, __kernel_fsid_t *fsid); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, - int allow_dups) + int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, - int allow_dups) + int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, allow_dups, + FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); } -- cgit v1.2.3 From 43b245a788e2d8f1bb742668a9bdace02fcb3e96 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:17 +0300 Subject: fsnotify: create helpers for group mark_mutex lock Create helpers to take and release the group mark_mutex lock. Define a flag FSNOTIFY_GROUP_NOFS in fsnotify_group that determines if the mark_mutex lock is fs reclaim safe or not. If not safe, the lock helpers take the lock and disable direct fs reclaim. In that case we annotate the mutex with a different lockdep class to express to lockdep that an allocation of mark of an fs reclaim safe group may take the group lock of another "NOFS" group to evict inodes. For now, converted only the callers in common code and no backend defines the NOFS flag. It is intended to be set by fanotify for evictable marks support. Link: https://lore.kernel.org/r/20220422120327.3459282-7-amir73il@gmail.com Suggested-by: Jan Kara Link: https://lore.kernel.org/r/20220321112310.vpr7oxro2xkz5llh@quack3.lan/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index dd440e6ff528..d62111e83244 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -20,6 +20,7 @@ #include #include #include +#include /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -212,7 +213,9 @@ struct fsnotify_group { #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ +#define FSNOTIFY_GROUP_NOFS 0x04 /* group lock is not direct reclaim safe */ int flags; + unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ @@ -254,6 +257,31 @@ struct fsnotify_group { }; }; +/* + * These helpers are used to prevent deadlock when reclaiming inodes with + * evictable marks of the same group that is allocating a new mark. + */ +static inline void fsnotify_group_lock(struct fsnotify_group *group) +{ + mutex_lock(&group->mark_mutex); + if (group->flags & FSNOTIFY_GROUP_NOFS) + group->owner_flags = memalloc_nofs_save(); +} + +static inline void fsnotify_group_unlock(struct fsnotify_group *group) +{ + if (group->flags & FSNOTIFY_GROUP_NOFS) + memalloc_nofs_restore(group->owner_flags); + mutex_unlock(&group->mark_mutex); +} + +static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) +{ + WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); + if (group->flags & FSNOTIFY_GROUP_NOFS) + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); +} + /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, -- cgit v1.2.3 From c3638b5b13740fa31762d414bbce8b7a694e582a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:22 +0300 Subject: fsnotify: allow adding an inode mark without pinning inode fsnotify_add_mark() and variants implicitly take a reference on inode when attaching a mark to an inode. Make that behavior opt-out with the mark flag FSNOTIFY_MARK_FLAG_NO_IREF. Instead of taking the inode reference when attaching connector to inode and dropping the inode reference when detaching connector from inode, take the inode reference on attach of the first mark that wants to hold an inode reference and drop the inode reference on detach of the last mark that wants to hold an inode reference. Backends can "upgrade" an existing mark to take an inode reference, but cannot "downgrade" a mark with inode reference to release the refernce. This leaves the choice to the backend whether or not to pin the inode when adding an inode mark. This is intended to be used when adding a mark with ignored mask that is used for optimization in cases where group can afford getting unneeded events and reinstate the mark with ignored mask when inode is accessed again after being evicted. Link: https://lore.kernel.org/r/20220422120327.3459282-12-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d62111e83244..9a1a9e78f69f 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -456,6 +456,7 @@ struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 +#define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { @@ -510,6 +511,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 +#define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 unsigned int flags; /* flags [mark->lock] */ }; -- cgit v1.2.3 From 5f9d3bd520261fd7a850818c71809fd580e0f30c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 22 Apr 2022 15:03:27 +0300 Subject: fanotify: enable "evictable" inode marks Now that the direct reclaim path is handled we can enable evictable inode marks. Link: https://lore.kernel.org/r/20220422120327.3459282-17-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 419cadcd7ff5..edc28555814c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -66,6 +66,7 @@ FAN_MARK_ONLYDIR | \ FAN_MARK_IGNORED_MASK | \ FAN_MARK_IGNORED_SURV_MODIFY | \ + FAN_MARK_EVICTABLE | \ FAN_MARK_FLUSH) /* -- cgit v1.2.3 From 430c3500995484962bdbccf358201afef8055535 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 25 Apr 2022 10:51:59 +0100 Subject: firmware: cirrus: cs_dsp: Avoid padding bytes in cs_dsp_coeff_ctl Change the order of members in struct cs_dsp_coeff_ctl to avoid the compiler having to insert alignment padding bytes. On a x86_64 build this saves 16 bytes per control. - Pointers are collected to the top of the struct (with the exception of priv, as noted below), so that they are inherently aligned. - The set and enable bitflags are placed together so they can be merged. - priv is placed at the end of the struct - it is for use by the client so it is helpful to make it stand out, and since the compiler will always pad the struct size to an alignment multiple putting a pointer last won't introduce any more padding. - struct cs_dsp_alg_region is placed at the end, right before priv, for the same reasoning as priv. Signed-off-by: Richard Fitzgerald Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20220425095159.3044527-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 38b4da3ddfe4..30055706cce2 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -68,36 +68,36 @@ struct cs_dsp_alg_region { /** * struct cs_dsp_coeff_ctl - Describes a coefficient control + * @list: List node for internal use + * @dsp: DSP instance associated with this control + * @cache: Cached value of the control * @fw_name: Name of the firmware * @subname: Name of the control parsed from the WMFW * @subname_len: Length of subname - * @alg_region: Logical region associated with this control - * @dsp: DSP instance associated with this control - * @enabled: Flag indicating whether control is enabled - * @list: List node for internal use - * @cache: Cached value of the control * @offset: Offset of control within alg_region in words * @len: Length of the cached value in bytes - * @set: Flag indicating the value has been written by the user - * @flags: Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h * @type: One of the WMFW_CTL_TYPE_ control types defined in wmfw.h + * @flags: Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h + * @set: Flag indicating the value has been written by the user + * @enabled: Flag indicating whether control is enabled + * @alg_region: Logical region associated with this control * @priv: For use by the client */ struct cs_dsp_coeff_ctl { + struct list_head list; + struct cs_dsp *dsp; + void *cache; const char *fw_name; /* Subname is needed to match with firmware */ const char *subname; unsigned int subname_len; - struct cs_dsp_alg_region alg_region; - struct cs_dsp *dsp; - unsigned int enabled:1; - struct list_head list; - void *cache; unsigned int offset; size_t len; - unsigned int set:1; - unsigned int flags; unsigned int type; + unsigned int flags; + unsigned int set:1; + unsigned int enabled:1; + struct cs_dsp_alg_region alg_region; void *priv; }; -- cgit v1.2.3 From 66200bbcde697dea7f48071d325ea4dec6f66b11 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 12 Apr 2022 19:49:00 -0700 Subject: Drivers: hv: vmbus: Add VMbus IMC device to unsupported list Hyper-V may offer an Initial Machine Configuration (IMC) synthetic device to guest VMs. The device may be used by Windows guests to get specialization information, such as the hostname. But the device is not used in Linux and there is no Linux driver, so it is unsupported. Currently, the IMC device GUID is not recognized by the VMbus driver, which results in an "Unknown GUID" error message during boot. Add the GUID to the list of known but unsupported devices so that the error message is not generated. Other than avoiding the error message, there is no change in guest behavior. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1649818140-100953-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index fe2e0179ed51..2d085d951ee4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1451,12 +1451,14 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size); 0x80, 0x2e, 0x27, 0xed, 0xe1, 0x9f) /* - * Linux doesn't support the 3 devices: the first two are for - * Automatic Virtual Machine Activation, and the third is for - * Remote Desktop Virtualization. + * Linux doesn't support these 4 devices: the first two are for + * Automatic Virtual Machine Activation, the third is for + * Remote Desktop Virtualization, and the fourth is Initial + * Machine Configuration (IMC) used only by Windows guests. * {f8e65716-3cb3-4a06-9a60-1889c5cccab5} * {3375baf4-9e15-4b30-b765-67acb10d607b} * {276aacf4-ac15-426c-98dd-7521ad3f01fe} + * {c376c1c3-d276-48d2-90a9-c04748072c60} */ #define HV_AVMA1_GUID \ @@ -1471,6 +1473,10 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size); .guid = GUID_INIT(0x276aacf4, 0xac15, 0x426c, 0x98, 0xdd, \ 0x75, 0x21, 0xad, 0x3f, 0x01, 0xfe) +#define HV_IMC_GUID \ + .guid = GUID_INIT(0xc376c1c3, 0xd276, 0x48d2, 0x90, 0xa9, \ + 0xc0, 0x47, 0x48, 0x07, 0x2c, 0x60) + /* * Common header for Hyper-V ICs */ -- cgit v1.2.3 From b03afa57c65e1e045e02df49777e953742745f4c Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:22 +0200 Subject: Drivers: hv: vmbus: Introduce vmbus_sendpacket_getid() The function can be used to send a VMbus packet and retrieve the corresponding transaction ID. It will be used by hv_pci. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2d085d951ee4..7cb8718825ee 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1161,6 +1161,13 @@ extern int vmbus_open(struct vmbus_channel *channel, extern void vmbus_close(struct vmbus_channel *channel); +extern int vmbus_sendpacket_getid(struct vmbus_channel *channel, + void *buffer, + u32 bufferLen, + u64 requestid, + u64 *trans_id, + enum vmbus_packet_type type, + u32 flags); extern int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, u32 bufferLen, -- cgit v1.2.3 From 0aadb6a7bb811554cf39318b5d18e8ec50dd9f02 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:23 +0200 Subject: Drivers: hv: vmbus: Introduce vmbus_request_addr_match() The function can be used to retrieve and clear/remove a transation ID from a channel requestor, provided the memory address corresponding to the ID equals a specified address. The function, and its 'lockless' variant __vmbus_request_addr_match(), will be used by hv_pci. Refactor vmbus_request_addr() to reuse the 'newly' introduced code. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 7cb8718825ee..2e4962a2fbec 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -788,6 +788,7 @@ struct vmbus_requestor { #define VMBUS_NO_RQSTOR U64_MAX #define VMBUS_RQST_ERROR (U64_MAX - 1) +#define VMBUS_RQST_ADDR_ANY U64_MAX /* NetVSC-specific */ #define VMBUS_RQST_ID_NO_RESPONSE (U64_MAX - 2) /* StorVSC-specific */ @@ -1042,6 +1043,10 @@ struct vmbus_channel { }; u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr); +u64 __vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr); +u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, + u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); static inline bool is_hvsock_channel(const struct vmbus_channel *c) -- cgit v1.2.3 From b91eaf7267cf7aec0a4e087decf7770dfb694d78 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:24 +0200 Subject: Drivers: hv: vmbus: Introduce {lock,unlock}_requestor() To abtract the lock and unlock operations on the requestor spin lock. The helpers will come in handy in hv_pci. No functional change. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-6-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2e4962a2fbec..460a716f4748 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1042,6 +1042,21 @@ struct vmbus_channel { u32 max_pkt_size; }; +#define lock_requestor(channel, flags) \ +do { \ + struct vmbus_requestor *rqstor = &(channel)->requestor; \ + \ + spin_lock_irqsave(&rqstor->req_lock, flags); \ +} while (0) + +static __always_inline void unlock_requestor(struct vmbus_channel *channel, + unsigned long flags) +{ + struct vmbus_requestor *rqstor = &channel->requestor; + + spin_unlock_irqrestore(&rqstor->req_lock, flags); +} + u64 vmbus_next_request_id(struct vmbus_channel *channel, u64 rqst_addr); u64 __vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); -- cgit v1.2.3 From 067c098766c6af667a9002d4e33cf1f3c998abbe Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Wed, 20 Apr 2022 17:25:05 -0500 Subject: of: overlay: rework overlay apply and remove kfree()s Fix various kfree() issues related to of_overlay_apply(). - Double kfree() of fdt and tree when init_overlay_changeset() returns an error. - free_overlay_changeset() free the root of the unflattened overlay (variable tree) instead of the memory that contains the unflattened overlay. - For the case of a failure during applying an overlay, move kfree() of new_fdt and overlay_mem into free_overlay_changeset(), which is called by the function that allocated them. - For the case of removing an overlay, the kfree() of new_fdt and overlay_mem remains in free_overlay_changeset(). - Check return value of of_fdt_unflatten_tree() for error instead of checking the returned value of overlay_root. - When storing pointers to allocated objects in ovcs, do so as near to the allocation as possible instead of in deeply layered function. More clearly document policy related to lifetime of pointers into overlay memory. Double kfree() Reported-by: Slawomir Stepien Signed-off-by: Frank Rowand Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220420222505.928492-3-frowand.list@gmail.com --- include/linux/of.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 04971e85fbc9..17741eee0ca4 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1543,7 +1543,8 @@ static inline bool of_device_is_system_power_controller(const struct device_node */ enum of_overlay_notify_action { - OF_OVERLAY_PRE_APPLY = 0, + OF_OVERLAY_INIT = 0, /* kzalloc() of ovcs sets this value */ + OF_OVERLAY_PRE_APPLY, OF_OVERLAY_POST_APPLY, OF_OVERLAY_PRE_REMOVE, OF_OVERLAY_POST_REMOVE, -- cgit v1.2.3 From 5b0e58542acb3672f42e08cb1cd57784f5e08d6b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:54 +0200 Subject: net: ieee802154: Enhance/fix the names of the MLME return codes Let's keep these definitions as close to the specification as possible while they are not yet in use. The names get slightly longer, but we gain the minor cost of being able to search the spec more easily. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 95c831162212..01d945c8b2e1 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -136,16 +136,16 @@ enum { IEEE802154_SUCCESS = 0x0, /* The beacon was lost following a synchronization request. */ - IEEE802154_BEACON_LOSS = 0xe0, + IEEE802154_BEACON_LOST = 0xe0, /* * A transmission could not take place due to activity on the * channel, i.e., the CSMA-CA mechanism has failed. */ - IEEE802154_CHNL_ACCESS_FAIL = 0xe1, + IEEE802154_CHANNEL_ACCESS_FAILURE = 0xe1, /* The GTS request has been denied by the PAN coordinator. */ - IEEE802154_DENINED = 0xe2, + IEEE802154_DENIED = 0xe2, /* The attempt to disable the transceiver has failed. */ - IEEE802154_DISABLE_TRX_FAIL = 0xe3, + IEEE802154_DISABLE_TRX_FAILURE = 0xe3, /* * The received frame induces a failed security check according to * the security suite. @@ -185,9 +185,9 @@ enum { * A PAN identifier conflict has been detected and communicated to the * PAN coordinator. */ - IEEE802154_PANID_CONFLICT = 0xee, + IEEE802154_PAN_ID_CONFLICT = 0xee, /* A coordinator realignment command has been received. */ - IEEE802154_REALIGMENT = 0xef, + IEEE802154_REALIGNMENT = 0xef, /* The transaction has expired and its information discarded. */ IEEE802154_TRANSACTION_EXPIRED = 0xf0, /* There is no capacity to store the transaction. */ @@ -203,7 +203,7 @@ enum { * A SET/GET request was issued with the identifier of a PIB attribute * that is not supported. */ - IEEE802154_UNSUPPORTED_ATTR = 0xf4, + IEEE802154_UNSUPPORTED_ATTRIBUTE = 0xf4, /* * A request to perform a scan operation failed because the MLME was * in the process of performing a previously initiated scan operation. -- cgit v1.2.3 From f06cfc233ac67e26ed137f4e098c047939cf1530 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 7 Apr 2022 12:08:55 +0200 Subject: net: ieee802154: Fill the list of MLME return codes There are more codes than already listed, let's be a bit more exhaustive. This will allow to drop device drivers local definitions of these codes. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220407100903.1695973-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 67 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index 01d945c8b2e1..f1f9412b6ac6 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -134,7 +134,35 @@ enum { * a successful transmission. */ IEEE802154_SUCCESS = 0x0, - + /* The requested operation failed. */ + IEEE802154_MAC_ERROR = 0x1, + /* The requested operation has been cancelled. */ + IEEE802154_CANCELLED = 0x2, + /* + * Device is ready to poll the coordinator for data in a non beacon + * enabled PAN. + */ + IEEE802154_READY_FOR_POLL = 0x3, + /* Wrong frame counter. */ + IEEE802154_COUNTER_ERROR = 0xdb, + /* + * The frame does not conforms to the incoming key usage policy checking + * procedure. + */ + IEEE802154_IMPROPER_KEY_TYPE = 0xdc, + /* + * The frame does not conforms to the incoming security level usage + * policy checking procedure. + */ + IEEE802154_IMPROPER_SECURITY_LEVEL = 0xdd, + /* Secured frame received with an empty Frame Version field. */ + IEEE802154_UNSUPPORTED_LEGACY = 0xde, + /* + * A secured frame is received or must be sent but security is not + * enabled in the device. Or, the Auxiliary Security Header has security + * level of zero in it. + */ + IEEE802154_UNSUPPORTED_SECURITY = 0xdf, /* The beacon was lost following a synchronization request. */ IEEE802154_BEACON_LOST = 0xe0, /* @@ -204,11 +232,48 @@ enum { * that is not supported. */ IEEE802154_UNSUPPORTED_ATTRIBUTE = 0xf4, + /* Missing source or destination address or address mode. */ + IEEE802154_INVALID_ADDRESS = 0xf5, + /* + * MLME asked to turn the receiver on, but the on time duration is too + * big compared to the macBeaconOrder. + */ + IEEE802154_ON_TIME_TOO_LONG = 0xf6, + /* + * MLME asaked to turn the receiver on, but the request was delayed for + * too long before getting processed. + */ + IEEE802154_PAST_TIME = 0xf7, + /* + * The StartTime parameter is nonzero, and the MLME is not currently + * tracking the beacon of the coordinator through which it is + * associated. + */ + IEEE802154_TRACKING_OFF = 0xf8, + /* + * The index inside the hierarchical values in PIBAttribute is out of + * range. + */ + IEEE802154_INVALID_INDEX = 0xf9, + /* + * The number of PAN descriptors discovered during a scan has been + * reached. + */ + IEEE802154_LIMIT_REACHED = 0xfa, + /* + * The PIBAttribute parameter specifies an attribute that is a read-only + * attribute. + */ + IEEE802154_READ_ONLY = 0xfb, /* * A request to perform a scan operation failed because the MLME was * in the process of performing a previously initiated scan operation. */ IEEE802154_SCAN_IN_PROGRESS = 0xfc, + /* The outgoing superframe overlaps the incoming superframe. */ + IEEE802154_SUPERFRAME_OVERLAP = 0xfd, + /* Any other error situation. */ + IEEE802154_SYSTEM_ERROR = 0xff, }; /* frame control handling */ -- cgit v1.2.3 From c83227a5d05ed77b634ce4c2fc5f143ae2a4d6f5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 21 Apr 2022 22:06:54 +0200 Subject: irq/gpio: ixp4xx: Drop boardfile probe path The boardfiles for IXP4xx have been deleted. Delete all the quirks and code dealing with that boot path and rely solely on device tree boot. Fix some missing static keywords that the kernel test robot was complaining about while we're at it. Cc: Bartosz Golaszewski Acked-by: Marc Zyngier Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/irqchip/irq-ixp4xx.h | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 include/linux/irqchip/irq-ixp4xx.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-ixp4xx.h b/include/linux/irqchip/irq-ixp4xx.h deleted file mode 100644 index 9395917d6936..000000000000 --- a/include/linux/irqchip/irq-ixp4xx.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __IRQ_IXP4XX_H -#define __IRQ_IXP4XX_H - -#include -struct irq_domain; - -void ixp4xx_irq_init(resource_size_t irqbase, - bool is_356); -struct irq_domain *ixp4xx_get_irq_domain(void); - -#endif /* __IRQ_IXP4XX_H */ -- cgit v1.2.3 From fae74fb5d525f979085b6e70b883d7a7049bf15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 25 Apr 2022 19:32:55 +0200 Subject: gpio: pcf857x: Make teardown callback return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All teardown functions return 0. Also there is little sense in returning a negative error code from an i2c remove function as this only results in emitting an error message but the device is removed nevertheless. This patch is a preparation for making i2c remove callbacks return void. Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- include/linux/platform_data/pcf857x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/pcf857x.h b/include/linux/platform_data/pcf857x.h index 11d4ed78c7f4..01d0a3ea3aef 100644 --- a/include/linux/platform_data/pcf857x.h +++ b/include/linux/platform_data/pcf857x.h @@ -36,7 +36,7 @@ struct pcf857x_platform_data { int (*setup)(struct i2c_client *client, int gpio, unsigned ngpio, void *context); - int (*teardown)(struct i2c_client *client, + void (*teardown)(struct i2c_client *client, int gpio, unsigned ngpio, void *context); void *context; -- cgit v1.2.3 From d9d31cf88702ae071bec033e5c8714048aa71285 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 25 Apr 2022 15:04:48 -0700 Subject: bpf: Use bpf_prog_run_array_cg_flags everywhere Rename bpf_prog_run_array_cg_flags to bpf_prog_run_array_cg and use it everywhere. check_return_code already enforces sane return ranges for all cgroup types. (only egress and bind hooks have uncanonical return ranges, the rest is using [0, 1]) No functional changes. v2: - 'func_ret & 1' under explicit test (Andrii & Martin) Suggested-by: Alexei Starovoitov Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220425220448.3669032-1-sdf@google.com --- include/linux/bpf-cgroup.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 88a51b242adc..669d96d074ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -225,24 +225,20 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - NULL, \ - &__unused_flags); \ + NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ - u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ - t_ctx, \ - &__unused_flags); \ + t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ -- cgit v1.2.3 From 61df10c7799e27807ad5e459eec9d77cddf8bf45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:49 +0530 Subject: bpf: Allow storing unreferenced kptr in map This commit introduces a new pointer type 'kptr' which can be embedded in a map value to hold a PTR_TO_BTF_ID stored by a BPF program during its invocation. When storing such a kptr, BPF program's PTR_TO_BTF_ID register must have the same type as in the map value's BTF, and loading a kptr marks the destination register as PTR_TO_BTF_ID with the correct kernel BTF and BTF ID. Such kptr are unreferenced, i.e. by the time another invocation of the BPF program loads this pointer, the object which the pointer points to may not longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are patched to PROBE_MEM loads by the verifier, it would safe to allow user to still access such invalid pointer, but passing such pointers into BPF helpers and kfuncs should not be permitted. A future patch in this series will close this gap. The flexibility offered by allowing programs to dereference such invalid pointers while being safe at runtime frees the verifier from doing complex lifetime tracking. As long as the user may ensure that the object remains valid, it can ensure data read by it from the kernel object is valid. The user indicates that a certain pointer must be treated as kptr capable of accepting stores of PTR_TO_BTF_ID of a certain type, by using a BTF type tag 'kptr' on the pointed to type of the pointer. Then, this information is recorded in the object BTF which will be passed into the kernel by way of map's BTF information. The name and kind from the map value BTF is used to look up the in-kernel type, and the actual BTF and BTF ID is recorded in the map struct in a new kptr_off_tab member. For now, only storing pointers to structs is permitted. An example of this specification is shown below: #define __kptr __attribute__((btf_type_tag("kptr"))) struct map_value { ... struct task_struct __kptr *task; ... }; Then, in a BPF program, user may store PTR_TO_BTF_ID with the type task_struct into the map, and then load it later. Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL, as the verifier cannot know whether the value is NULL or not statically, it must treat all potential loads at that map value offset as loading a possibly NULL pointer. Only BPF_LDX, BPF_STX, and BPF_ST (with insn->imm = 0 to denote NULL) are allowed instructions that can access such a pointer. On BPF_LDX, the destination register is updated to be a PTR_TO_BTF_ID, and on BPF_STX, it is checked whether the source register type is a PTR_TO_BTF_ID with same BTF type as specified in the map BTF. The access size must always be BPF_DW. For the map in map support, the kptr_off_tab for outer map is copied from the inner map's kptr_off_tab. It was chosen to do a deep copy instead of introducing a refcount to kptr_off_tab, because the copy only needs to be done when paramterizing using inner_map_fd in the map in map case, hence would be unnecessary for all other users. It is not permitted to use MAP_FREEZE command and mmap for BPF map having kptrs, similar to the bpf_timer case. A kptr also requires that BPF program has both read and write access to the map (hence both BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG are disallowed). Note that check_map_access must be called from both check_helper_mem_access and for the BPF instructions, hence the kptr check must distinguish between ACCESS_DIRECT and ACCESS_HELPER, and reject ACCESS_HELPER cases. We rename stack_access_src to bpf_access_src and reuse it for this purpose. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-2-memxor@gmail.com --- include/linux/bpf.h | 31 ++++++++++++++++++++++++++++++- include/linux/btf.h | 2 ++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7bf441563ffc..ce12124048c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,24 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; +enum { + /* Support at most 8 pointers in a BPF map value */ + BPF_MAP_VALUE_OFF_MAX = 8, +}; + +struct bpf_map_value_off_desc { + u32 offset; + struct { + struct btf *btf; + u32 btf_id; + } kptr; +}; + +struct bpf_map_value_off { + u32 nr_off; + struct bpf_map_value_off_desc off[]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -171,6 +189,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ + struct bpf_map_value_off *kptr_off_tab; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -184,7 +203,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 14 bytes hole */ + /* 6 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -217,6 +236,11 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } +static inline bool map_value_has_kptrs(const struct bpf_map *map) +{ + return !IS_ERR_OR_NULL(map->kptr_off_tab); +} + static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) @@ -1396,6 +1420,11 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); +void bpf_map_free_kptr_off_tab(struct bpf_map *map); +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); + struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); diff --git a/include/linux/btf.h b/include/linux/btf.h index 36bc09b8e890..19c297f9a52f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -123,6 +123,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From 8f14852e89113d738c99c375b4c8b8b7e1073df1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:50 +0530 Subject: bpf: Tag argument to be released in bpf_func_proto Add a new type flag for bpf_arg_type that when set tells verifier that for a release function, that argument's register will be the one for which meta.ref_obj_id will be set, and which will then be released using release_reference. To capture the regno, introduce a new field release_regno in bpf_call_arg_meta. This would be required in the next patch, where we may either pass NULL or a refcounted pointer as an argument to the release function bpf_kptr_xchg. Just releasing only when meta.ref_obj_id is set is not enough, as there is a case where the type of argument needed matches, but the ref_obj_id is set to 0. Hence, we must enforce that whenever meta.ref_obj_id is zero, the register that is to be released can only be NULL for a release function. Since we now indicate whether an argument is to be released in bpf_func_proto itself, is_release_function helper has lost its utitlity, hence refactor code to work without it, and just rely on meta.release_regno to know when to release state for a ref_obj_id. Still, the restriction of one release argument and only one ref_obj_id passed to BPF helper or kfunc remains. This may be lifted in the future. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-3-memxor@gmail.com --- include/linux/bpf.h | 5 ++++- include/linux/bpf_verifier.h | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ce12124048c0..492edd2c5713 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -366,7 +366,10 @@ enum bpf_type_flag { */ MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_PERCPU, + /* Indicates that the argument will be released. */ + OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, }; /* Max number of base types. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a9d2d7cc6b7..1f1e7f2ea967 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -523,8 +523,7 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func); + enum bpf_arg_type arg_type); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -- cgit v1.2.3 From c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:51 +0530 Subject: bpf: Allow storing referenced kptr in map Extending the code in previous commits, introduce referenced kptr support, which needs to be tagged using 'kptr_ref' tag instead. Unlike unreferenced kptr, referenced kptr have a lot more restrictions. In addition to the type matching, only a newly introduced bpf_kptr_xchg helper is allowed to modify the map value at that offset. This transfers the referenced pointer being stored into the map, releasing the references state for the program, and returning the old value and creating new reference state for the returned pointer. Similar to unreferenced pointer case, return value for this case will also be PTR_TO_BTF_ID_OR_NULL. The reference for the returned pointer must either be eventually released by calling the corresponding release function, otherwise it must be transferred into another map. It is also allowed to call bpf_kptr_xchg with a NULL pointer, to clear the value, and obtain the old value if any. BPF_LDX, BPF_STX, and BPF_ST cannot access referenced kptr. A future commit will permit using BPF_LDX for such pointers, but attempt at making it safe, since the lifetime of object won't be guaranteed. There are valid reasons to enforce the restriction of permitting only bpf_kptr_xchg to operate on referenced kptr. The pointer value must be consistent in face of concurrent modification, and any prior values contained in the map must also be released before a new one is moved into the map. To ensure proper transfer of this ownership, bpf_kptr_xchg returns the old value, which the verifier would require the user to either free or move into another map, and releases the reference held for the pointer being moved in. In the future, direct BPF_XCHG instruction may also be permitted to work like bpf_kptr_xchg helper. Note that process_kptr_func doesn't have to call check_helper_mem_access, since we already disallow rdonly/wronly flags for map, which is what check_map_access_type checks, and we already ensure the PTR_TO_MAP_VALUE refers to kptr by obtaining its off_desc, so check_map_access is also not required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-4-memxor@gmail.com --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 492edd2c5713..24310837bafc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -160,8 +160,14 @@ enum { BPF_MAP_VALUE_OFF_MAX = 8, }; +enum bpf_kptr_type { + BPF_KPTR_UNREF, + BPF_KPTR_REF, +}; + struct bpf_map_value_off_desc { u32 offset; + enum bpf_kptr_type type; struct { struct btf *btf; u32 btf_id; @@ -418,6 +424,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ + ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -427,6 +434,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From 6efe152d4061a83a2c5db6a0e5b58f9345c9742f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:52 +0530 Subject: bpf: Prevent escaping of kptr loaded from maps While we can guarantee that even for unreferenced kptr, the object pointer points to being freed etc. can be handled by the verifier's exception handling (normal load patching to PROBE_MEM loads), we still cannot allow the user to pass these pointers to BPF helpers and kfunc, because the same exception handling won't be done for accesses inside the kernel. The same is true if a referenced pointer is loaded using normal load instruction. Since the reference is not guaranteed to be held while the pointer is used, it must be marked as untrusted. Hence introduce a new type flag, PTR_UNTRUSTED, which is used to mark all registers loading unreferenced and referenced kptr from BPF maps, and ensure they can never escape the BPF program and into the kernel by way of calling stable/unstable helpers. In check_ptr_to_btf_access, the !type_may_be_null check to reject type flags is still correct, as apart from PTR_MAYBE_NULL, only MEM_USER, MEM_PERCPU, and PTR_UNTRUSTED may be set for PTR_TO_BTF_ID. The first two are checked inside the function and rejected using a proper error message, but we still want to allow dereference of untrusted case. Also, we make sure to inherit PTR_UNTRUSTED when chain of pointers are walked, so that this flag is never dropped once it has been set on a PTR_TO_BTF_ID (i.e. trusted to untrusted transition can only be in one direction). In convert_ctx_accesses, extend the switch case to consider untrusted PTR_TO_BTF_ID in addition to normal PTR_TO_BTF_ID for PROBE_MEM conversion for BPF_LDX. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-5-memxor@gmail.com --- include/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 24310837bafc..e13a5cbd4ebb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -375,7 +375,15 @@ enum bpf_type_flag { /* Indicates that the argument will be released. */ OBJ_RELEASE = BIT(5 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = OBJ_RELEASE, + /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark + * unreferenced and referenced kptr loaded from map value using a load + * instruction, so that they can only be dereferenced but not escape the + * BPF program into the kernel (i.e. cannot be passed as arguments to + * kfunc or bpf helpers). + */ + PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, }; /* Max number of base types. */ -- cgit v1.2.3 From 4d7d7f69f4b104b2ddeec6a1e7fcfd2d044ed8c4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:53 +0530 Subject: bpf: Adapt copy_map_value for multiple offset case Since now there might be at most 10 offsets that need handling in copy_map_value, the manual shuffling and special case is no longer going to work. Hence, let's generalise the copy_map_value function by using a sorted array of offsets to skip regions that must be avoided while copying into and out of a map value. When the map is created, we populate the offset array in struct map, Then, copy_map_value uses this sorted offset array is used to memcpy while skipping timer, spin lock, and kptr. The array is allocated as in most cases none of these special fields would be present in map value, hence we can save on space for the common case by not embedding the entire object inside bpf_map struct. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-6-memxor@gmail.com --- include/linux/bpf.h | 56 ++++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e13a5cbd4ebb..4e12b27a5de6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -158,6 +158,9 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BPF map value */ BPF_MAP_VALUE_OFF_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + 1 + /* for bpf_spin_lock */ + 1, /* for bpf_timer */ }; enum bpf_kptr_type { @@ -179,6 +182,12 @@ struct bpf_map_value_off { struct bpf_map_value_off_desc off[]; }; +struct bpf_map_off_arr { + u32 cnt; + u32 field_off[BPF_MAP_OFF_ARR_MAX]; + u8 field_sz[BPF_MAP_OFF_ARR_MAX]; +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -207,10 +216,7 @@ struct bpf_map { struct mem_cgroup *memcg; #endif char name[BPF_OBJ_NAME_LEN]; - bool bypass_spec_v1; - bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 6 bytes hole */ - + struct bpf_map_off_arr *off_arr; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -230,6 +236,8 @@ struct bpf_map { bool jited; bool xdp_has_frags; } owner; + bool bypass_spec_v1; + bool frozen; /* write-once; write-protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -253,37 +261,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); + if (unlikely(map_value_has_kptrs(map))) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) + *(u64 *)(dst + tab->off[i].offset) = 0; + } } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0; + u32 curr_off = 0; + int i; - if (unlikely(map_value_has_spin_lock(map))) { - s_off = map->spin_lock_off; - s_sz = sizeof(struct bpf_spin_lock); - } - if (unlikely(map_value_has_timer(map))) { - t_off = map->timer_off; - t_sz = sizeof(struct bpf_timer); + if (likely(!map->off_arr)) { + memcpy(dst, src, map->value_size); + return; } - if (unlikely(s_sz || t_sz)) { - if (s_off < t_off || !s_sz) { - swap(s_off, t_off); - swap(s_sz, t_sz); - } - memcpy(dst, src, t_off); - memcpy(dst + t_off + t_sz, - src + t_off + t_sz, - s_off - t_off - t_sz); - memcpy(dst + s_off + s_sz, - src + s_off + s_sz, - map->value_size - s_off - s_sz); - } else { - memcpy(dst, src, map->value_size); + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memcpy(dst + curr_off, src + curr_off, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; } + memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); -- cgit v1.2.3 From 5ce937d613a423ca3102f53d9f3daf4210c1b6e2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:54 +0530 Subject: bpf: Populate pairs of btf_id and destructor kfunc in btf To support storing referenced PTR_TO_BTF_ID in maps, we require associating a specific BTF ID with a 'destructor' kfunc. This is because we need to release a live referenced pointer at a certain offset in map value from the map destruction path, otherwise we end up leaking resources. Hence, introduce support for passing an array of btf_id, kfunc_btf_id pairs that denote a BTF ID and its associated release function. Then, add an accessor 'btf_find_dtor_kfunc' which can be used to look up the destructor kfunc of a certain BTF ID. If found, we can use it to free the object from the map free path. The registration of these pairs also serve as a whitelist of structures which are allowed as referenced PTR_TO_BTF_ID in a BPF map, because without finding the destructor kfunc, we will bail and return an error. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-7-memxor@gmail.com --- include/linux/btf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 19c297f9a52f..fea424681d66 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -40,6 +40,11 @@ struct btf_kfunc_id_set { }; }; +struct btf_id_dtor_kfunc { + u32 btf_id; + u32 kfunc_btf_id; +}; + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); @@ -346,6 +351,9 @@ bool btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_type type, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -369,6 +377,15 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, { return 0; } +static inline s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + return -ENOENT; +} +static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, + u32 add_cnt, struct module *owner) +{ + return 0; +} #endif #endif -- cgit v1.2.3 From 14a324f6a67ef6a53e04362a70160a47eb8afffa Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:55 +0530 Subject: bpf: Wire up freeing of referenced kptr A destructor kfunc can be defined as void func(type *), where type may be void or any other pointer type as per convenience. In this patch, we ensure that the type is sane and capture the function pointer into off_desc of ptr_off_tab for the specific pointer offset, with the invariant that the dtor pointer is always set when 'kptr_ref' tag is applied to the pointer's pointee type, which is indicated by the flag BPF_MAP_VALUE_OFF_F_REF. Note that only BTF IDs whose destructor kfunc is registered, thus become the allowed BTF IDs for embedding as referenced kptr. Hence it serves the purpose of finding dtor kfunc BTF ID, as well acting as a check against the whitelist of allowed BTF IDs for this purpose. Finally, wire up the actual freeing of the referenced pointer if any at all available offsets, so that no references are leaked after the BPF map goes away and the BPF program previously moved the ownership a referenced pointer into it. The behavior is similar to BPF timers, where bpf_map_{update,delete}_elem will free any existing referenced kptr. The same case is with LRU map's bpf_lru_push_free/htab_lru_push_free functions, which are extended to reset unreferenced and free referenced kptr. Note that unlike BPF timers, kptr is not reset or freed when map uref drops to zero. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-8-memxor@gmail.com --- include/linux/bpf.h | 4 ++++ include/linux/btf.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e12b27a5de6..6141564c76c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -173,6 +174,8 @@ struct bpf_map_value_off_desc { enum bpf_kptr_type type; struct { struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; u32 btf_id; } kptr; }; @@ -1447,6 +1450,7 @@ struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u3 void bpf_map_free_kptr_off_tab(struct bpf_map *map); struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index fea424681d66..f70625dd5bb4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -45,6 +45,8 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +typedef void (*btf_dtor_kfunc_t)(void *); + extern const struct file_operations btf_fops; void btf_get(struct btf *btf); -- cgit v1.2.3 From a1ef195996526da45bbc9710849254023df75aea Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:56 +0530 Subject: bpf: Teach verifier about kptr_get kfunc helpers We introduce a new style of kfunc helpers, namely *_kptr_get, where they take pointer to the map value which points to a referenced kernel pointer contained in the map. Since this is referenced, only bpf_kptr_xchg from BPF side and xchg from kernel side is allowed to change the current value, and each pointer that resides in that location would be referenced, and RCU protected (this must be kept in mind while adding kernel types embeddable as reference kptr in BPF maps). This means that if do the load of the pointer value in an RCU read section, and find a live pointer, then as long as we hold RCU read lock, it won't be freed by a parallel xchg + release operation. This allows us to implement a safe refcount increment scheme. Hence, enforce that first argument of all such kfunc is a proper PTR_TO_MAP_VALUE pointing at the right offset to referenced pointer. For the rest of the arguments, they are subjected to typical kfunc argument checks, hence allowing some flexibility in passing more intent into how the reference should be taken. For instance, in case of struct nf_conn, it is not freed until RCU grace period ends, but can still be reused for another tuple once refcount has dropped to zero. Hence, a bpf_ct_kptr_get helper not only needs to call refcount_inc_not_zero, but also do a tuple match after incrementing the reference, and when it fails to match it, put the reference again and return NULL. This can be implemented easily if we allow passing additional parameters to the bpf_ct_kptr_get kfunc, like a struct bpf_sock_tuple * and a tuple__sz pair. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-9-memxor@gmail.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f70625dd5bb4..2611cea2c2b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -17,6 +17,7 @@ enum btf_kfunc_type { BTF_KFUNC_TYPE_ACQUIRE, BTF_KFUNC_TYPE_RELEASE, BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_KPTR_ACQUIRE, BTF_KFUNC_TYPE_MAX, }; @@ -35,6 +36,7 @@ struct btf_kfunc_id_set { struct btf_id_set *acquire_set; struct btf_id_set *release_set; struct btf_id_set *ret_null_set; + struct btf_id_set *kptr_acquire_set; }; struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; }; -- cgit v1.2.3 From 2ab3b3808eb17f729edfd69e061667ca0a427195 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Apr 2022 03:18:57 +0530 Subject: bpf: Make BTF type match stricter for release arguments The current of behavior of btf_struct_ids_match for release arguments is that when type match fails, it retries with first member type again (recursively). Since the offset is already 0, this is akin to just casting the pointer in normal C, since if type matches it was just embedded inside parent sturct as an object. However, we want to reject cases for release function type matching, be it kfunc or BPF helpers. An example is the following: struct foo { struct bar b; }; struct foo *v = acq_foo(); rel_bar(&v->b); // btf_struct_ids_match fails btf_types_are_same, then // retries with first member type and succeeds, while // it should fail. Hence, don't walk the struct and only rely on btf_types_are_same for strict mode. All users of strict mode must be dealing with zero offset anyway, since otherwise they would want the struct to be walked. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220424214901.2743946-10-memxor@gmail.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6141564c76c8..0af5793ba417 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1748,7 +1748,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id); + const struct btf *need_btf, u32 need_type_id, + bool strict); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From 66eb6df79aefd6b3f7d2e749da7104e90cedc0ff Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Mon, 25 Apr 2022 09:16:16 -0500 Subject: tee: remove tee_shm_va2pa() and tee_shm_pa2va() We should not need to index into SHMs based on absolute VA/PA. These functions are not used and this kind of usage should not be encouraged anyway. Remove these functions. Signed-off-by: Andrew Davis Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 911cad324acc..17eb1c5205d3 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -298,24 +298,6 @@ void tee_shm_free(struct tee_shm *shm); */ void tee_shm_put(struct tee_shm *shm); -/** - * tee_shm_va2pa() - Get physical address of a virtual address - * @shm: Shared memory handle - * @va: Virtual address to tranlsate - * @pa: Returned physical address - * @returns 0 on success and < 0 on failure - */ -int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa); - -/** - * tee_shm_pa2va() - Get virtual address of a physical address - * @shm: Shared memory handle - * @pa: Physical address to tranlsate - * @va: Returned virtual address - * @returns 0 on success and < 0 on failure - */ -int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va); - /** * tee_shm_get_va() - Get virtual address of a shared memory plus an offset * @shm: Shared memory handle -- cgit v1.2.3 From 97730bbb242cde22b7140acd202ffd88823886c9 Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 21 Apr 2022 14:22:00 -0700 Subject: firmware_loader: Add firmware-upload support Extend the firmware subsystem to support a persistent sysfs interface that userspace may use to initiate a firmware update. For example, FPGA based PCIe cards load firmware and FPGA images from local FLASH when the card boots. The images in FLASH may be updated with new images provided by the user at his/her convenience. A device driver may call firmware_upload_register() to expose persistent "loading" and "data" sysfs files. These files are used in the same way as the fallback sysfs "loading" and "data" files. When 0 is written to "loading" to complete the write of firmware data, the data is transferred to the lower-level driver using pre-registered call-back functions. The data transfer is done in the context of a kernel worker thread. Reviewed-by: Luis Chamberlain Reviewed-by: Tianfei zhang Tested-by: Matthew Gerlach Signed-off-by: Russ Weight Link: https://lore.kernel.org/r/20220421212204.36052-5-russell.h.weight@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index ec2ccfebef65..de7fea3bca51 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -17,6 +17,64 @@ struct firmware { void *priv; }; +/** + * enum fw_upload_err - firmware upload error codes + * @FW_UPLOAD_ERR_NONE: returned to indicate success + * @FW_UPLOAD_ERR_HW_ERROR: error signalled by hardware, see kernel log + * @FW_UPLOAD_ERR_TIMEOUT: SW timed out on handshake with HW/firmware + * @FW_UPLOAD_ERR_CANCELED: upload was cancelled by the user + * @FW_UPLOAD_ERR_BUSY: there is an upload operation already in progress + * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size + * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log + * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry + * @FW_UPLOAD_ERR_MAX: Maximum error code marker + */ +enum fw_upload_err { + FW_UPLOAD_ERR_NONE, + FW_UPLOAD_ERR_HW_ERROR, + FW_UPLOAD_ERR_TIMEOUT, + FW_UPLOAD_ERR_CANCELED, + FW_UPLOAD_ERR_BUSY, + FW_UPLOAD_ERR_INVALID_SIZE, + FW_UPLOAD_ERR_RW_ERROR, + FW_UPLOAD_ERR_WEAROUT, + FW_UPLOAD_ERR_MAX +}; + +struct fw_upload { + void *dd_handle; /* reference to parent driver */ + void *priv; /* firmware loader private fields */ +}; + +/** + * struct fw_upload_ops - device specific operations to support firmware upload + * @prepare: Required: Prepare secure update + * @write: Required: The write() op receives the remaining + * size to be written and must return the actual + * size written or a negative error code. The write() + * op will be called repeatedly until all data is + * written. + * @poll_complete: Required: Check for the completion of the + * HW authentication/programming process. + * @cancel: Required: Request cancellation of update. This op + * is called from the context of a different kernel + * thread, so race conditions need to be considered. + * @cleanup: Optional: Complements the prepare() + * function and is called at the completion + * of the update, on success or failure, if the + * prepare function succeeded. + */ +struct fw_upload_ops { + enum fw_upload_err (*prepare)(struct fw_upload *fw_upload, + const u8 *data, u32 size); + enum fw_upload_err (*write)(struct fw_upload *fw_upload, + const u8 *data, u32 offset, + u32 size, u32 *written); + enum fw_upload_err (*poll_complete)(struct fw_upload *fw_upload); + void (*cancel)(struct fw_upload *fw_upload); + void (*cleanup)(struct fw_upload *fw_upload); +}; + struct module; struct device; @@ -112,6 +170,30 @@ static inline int request_partial_firmware_into_buf #endif +#ifdef CONFIG_FW_UPLOAD + +struct fw_upload * +firmware_upload_register(struct module *module, struct device *parent, + const char *name, const struct fw_upload_ops *ops, + void *dd_handle); +void firmware_upload_unregister(struct fw_upload *fw_upload); + +#else + +static inline struct fw_upload * +firmware_upload_register(struct module *module, struct device *parent, + const char *name, const struct fw_upload_ops *ops, + void *dd_handle) +{ + return ERR_PTR(-EINVAL); +} + +static inline void firmware_upload_unregister(struct fw_upload *fw_upload) +{ +} + +#endif + int firmware_request_cache(struct device *device, const char *name); #endif -- cgit v1.2.3 From d434743e5cac3558381b767f343eef5af246a4dc Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:37 +0530 Subject: bus: mhi: ep: Add support for registering MHI endpoint controllers This commit adds support for registering MHI endpoint controller drivers with the MHI endpoint stack. MHI endpoint controller drivers manage the interaction with the host machines (such as x86). They are also the MHI endpoint bus master in charge of managing the physical link between the host and endpoint device. Eventhough the MHI spec is bus agnostic, the current implementation is entirely based on PCIe bus. The endpoint controller driver encloses all information about the underlying physical bus like PCIe. The registration process involves parsing the channel configuration and allocating an MHI EP device. Channels used in the endpoint stack follows the perspective of the MHI host stack. i.e., UL - From host to endpoint DL - From endpoint to host Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-2-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 136 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 include/linux/mhi_ep.h (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h new file mode 100644 index 000000000000..891b199a9703 --- /dev/null +++ b/include/linux/mhi_ep.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Linaro Ltd. + * + */ +#ifndef _MHI_EP_H_ +#define _MHI_EP_H_ + +#include +#include + +#define MHI_EP_DEFAULT_MTU 0x8000 + +/** + * struct mhi_ep_channel_config - Channel configuration structure for controller + * @name: The name of this channel + * @num: The number assigned to this channel + * @num_elements: The number of elements that can be queued to this channel + * @dir: Direction that data may flow on this channel + */ +struct mhi_ep_channel_config { + char *name; + u32 num; + u32 num_elements; + enum dma_data_direction dir; +}; + +/** + * struct mhi_ep_cntrl_config - MHI Endpoint controller configuration + * @mhi_version: MHI spec version supported by the controller + * @max_channels: Maximum number of channels supported + * @num_channels: Number of channels defined in @ch_cfg + * @ch_cfg: Array of defined channels + */ +struct mhi_ep_cntrl_config { + u32 mhi_version; + u32 max_channels; + u32 num_channels; + const struct mhi_ep_channel_config *ch_cfg; +}; + +/** + * struct mhi_ep_db_info - MHI Endpoint doorbell info + * @mask: Mask of the doorbell interrupt + * @status: Status of the doorbell interrupt + */ +struct mhi_ep_db_info { + u32 mask; + u32 status; +}; + +/** + * struct mhi_ep_cntrl - MHI Endpoint controller structure + * @cntrl_dev: Pointer to the struct device of physical bus acting as the MHI + * Endpoint controller + * @mhi_dev: MHI Endpoint device instance for the controller + * @mmio: MMIO region containing the MHI registers + * @mhi_chan: Points to the channel configuration table + * @mhi_event: Points to the event ring configurations table + * @mhi_cmd: Points to the command ring configurations table + * @sm: MHI Endpoint state machine + * @raise_irq: CB function for raising IRQ to the host + * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it + * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context + * @read_from_host: CB function for reading from host memory from endpoint + * @write_to_host: CB function for writing to host memory from endpoint + * @mhi_state: MHI Endpoint state + * @max_chan: Maximum channels supported by the endpoint controller + * @mru: MRU (Maximum Receive Unit) value of the endpoint controller + * @index: MHI Endpoint controller index + */ +struct mhi_ep_cntrl { + struct device *cntrl_dev; + struct mhi_ep_device *mhi_dev; + void __iomem *mmio; + + struct mhi_ep_chan *mhi_chan; + struct mhi_ep_event *mhi_event; + struct mhi_ep_cmd *mhi_cmd; + struct mhi_ep_sm *sm; + + void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); + int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, + void __iomem **virt, size_t size); + void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys, + void __iomem *virt, size_t size); + int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, u64 from, void *to, size_t size); + int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, void *from, u64 to, size_t size); + + enum mhi_state mhi_state; + + u32 max_chan; + u32 mru; + u32 index; +}; + +/** + * struct mhi_ep_device - Structure representing an MHI Endpoint device that binds + * to channels or is associated with controllers + * @dev: Driver model device node for the MHI Endpoint device + * @mhi_cntrl: Controller the device belongs to + * @id: Pointer to MHI Endpoint device ID struct + * @name: Name of the associated MHI Endpoint device + * @ul_chan: UL channel for the device + * @dl_chan: DL channel for the device + * @dev_type: MHI device type + */ +struct mhi_ep_device { + struct device dev; + struct mhi_ep_cntrl *mhi_cntrl; + const struct mhi_device_id *id; + const char *name; + struct mhi_ep_chan *ul_chan; + struct mhi_ep_chan *dl_chan; + enum mhi_device_type dev_type; +}; + +#define to_mhi_ep_device(dev) container_of(dev, struct mhi_ep_device, dev) + +/** + * mhi_ep_register_controller - Register MHI Endpoint controller + * @mhi_cntrl: MHI Endpoint controller to register + * @config: Configuration to use for the controller + * + * Return: 0 if controller registrations succeeds, a negative error code otherwise. + */ +int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl, + const struct mhi_ep_cntrl_config *config); + +/** + * mhi_ep_unregister_controller - Unregister MHI Endpoint controller + * @mhi_cntrl: MHI Endpoint controller to unregister + */ +void mhi_ep_unregister_controller(struct mhi_ep_cntrl *mhi_cntrl); + +#endif -- cgit v1.2.3 From ee0360b20b3fa08e2eee300eacb45e812ae44578 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:38 +0530 Subject: bus: mhi: ep: Add support for registering MHI endpoint client drivers This commit adds support for registering MHI endpoint client drivers with the MHI endpoint stack. MHI endpoint client drivers bind to one or more MHI endpoint devices inorder to send and receive the upper-layer protocol packets like IP packets, modem control messages, and diagnostics messages over MHI bus. Reviewed-by: Hemant Kumar Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-3-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 891b199a9703..e2b94f9eb846 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -101,8 +101,8 @@ struct mhi_ep_cntrl { * @mhi_cntrl: Controller the device belongs to * @id: Pointer to MHI Endpoint device ID struct * @name: Name of the associated MHI Endpoint device - * @ul_chan: UL channel for the device - * @dl_chan: DL channel for the device + * @ul_chan: UL (from host to endpoint) channel for the device + * @dl_chan: DL (from endpoint to host) channel for the device * @dev_type: MHI device type */ struct mhi_ep_device { @@ -115,7 +115,60 @@ struct mhi_ep_device { enum mhi_device_type dev_type; }; +/** + * struct mhi_ep_driver - Structure representing a MHI Endpoint client driver + * @id_table: Pointer to MHI Endpoint device ID table + * @driver: Device driver model driver + * @probe: CB function for client driver probe function + * @remove: CB function for client driver remove function + * @ul_xfer_cb: CB function for UL (from host to endpoint) data transfer + * @dl_xfer_cb: CB function for DL (from endpoint to host) data transfer + */ +struct mhi_ep_driver { + const struct mhi_device_id *id_table; + struct device_driver driver; + int (*probe)(struct mhi_ep_device *mhi_ep, + const struct mhi_device_id *id); + void (*remove)(struct mhi_ep_device *mhi_ep); + void (*ul_xfer_cb)(struct mhi_ep_device *mhi_dev, + struct mhi_result *result); + void (*dl_xfer_cb)(struct mhi_ep_device *mhi_dev, + struct mhi_result *result); +}; + #define to_mhi_ep_device(dev) container_of(dev, struct mhi_ep_device, dev) +#define to_mhi_ep_driver(drv) container_of(drv, struct mhi_ep_driver, driver) + +/* + * module_mhi_ep_driver() - Helper macro for drivers that don't do + * anything special other than using default mhi_ep_driver_register() and + * mhi_ep_driver_unregister(). This eliminates a lot of boilerplate. + * Each module may only use this macro once. + */ +#define module_mhi_ep_driver(mhi_drv) \ + module_driver(mhi_drv, mhi_ep_driver_register, \ + mhi_ep_driver_unregister) + +/* + * Macro to avoid include chaining to get THIS_MODULE + */ +#define mhi_ep_driver_register(mhi_drv) \ + __mhi_ep_driver_register(mhi_drv, THIS_MODULE) + +/** + * __mhi_ep_driver_register - Register a driver with MHI Endpoint bus + * @mhi_drv: Driver to be associated with the device + * @owner: The module owner + * + * Return: 0 if driver registrations succeeds, a negative error code otherwise. + */ +int __mhi_ep_driver_register(struct mhi_ep_driver *mhi_drv, struct module *owner); + +/** + * mhi_ep_driver_unregister - Unregister a driver from MHI Endpoint bus + * @mhi_drv: Driver associated with the device + */ +void mhi_ep_driver_unregister(struct mhi_ep_driver *mhi_drv); /** * mhi_ep_register_controller - Register MHI Endpoint controller -- cgit v1.2.3 From e9e4da23cd65ea76ba658346f5c182791bd1cea9 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:40 +0530 Subject: bus: mhi: ep: Add support for managing MMIO registers Add support for managing the Memory Mapped Input Output (MMIO) registers of the MHI bus. All MHI operations are carried out using the MMIO registers by both host and the endpoint device. The MMIO registers reside inside the endpoint device memory (fixed location based on the platform) and the address is passed by the MHI EP controller driver during its registration. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-5-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index e2b94f9eb846..5db048e258e4 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -59,6 +59,10 @@ struct mhi_ep_db_info { * @mhi_event: Points to the event ring configurations table * @mhi_cmd: Points to the command ring configurations table * @sm: MHI Endpoint state machine + * @ch_ctx_host_pa: Physical address of host channel context data structure + * @ev_ctx_host_pa: Physical address of host event context data structure + * @cmd_ctx_host_pa: Physical address of host command context data structure + * @chdb: Array of channel doorbell interrupt info * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -67,6 +71,10 @@ struct mhi_ep_db_info { * @mhi_state: MHI Endpoint state * @max_chan: Maximum channels supported by the endpoint controller * @mru: MRU (Maximum Receive Unit) value of the endpoint controller + * @event_rings: Number of event rings supported by the endpoint controller + * @hw_event_rings: Number of hardware event rings supported by the endpoint controller + * @chdb_offset: Channel doorbell offset set by the host + * @erdb_offset: Event ring doorbell offset set by the host * @index: MHI Endpoint controller index */ struct mhi_ep_cntrl { @@ -79,6 +87,12 @@ struct mhi_ep_cntrl { struct mhi_ep_cmd *mhi_cmd; struct mhi_ep_sm *sm; + u64 ch_ctx_host_pa; + u64 ev_ctx_host_pa; + u64 cmd_ctx_host_pa; + + struct mhi_ep_db_info chdb[4]; + void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, void __iomem **virt, size_t size); @@ -91,6 +105,10 @@ struct mhi_ep_cntrl { u32 max_chan; u32 mru; + u32 event_rings; + u32 hw_event_rings; + u32 chdb_offset; + u32 erdb_offset; u32 index; }; -- cgit v1.2.3 From 961aeb6892242e0a667f5b8eb62b9b0a6041752c Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:42 +0530 Subject: bus: mhi: ep: Add support for sending events to the host Add support for sending the events to the host over MHI bus from the endpoint. Following events are supported: 1. Transfer completion event 2. Command completion event 3. State change event 4. Execution Environment (EE) change event An event is sent whenever an operation has been completed in the MHI EP device. Event is sent using the MHI event ring and additionally the host is notified using an IRQ if required. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-7-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 5db048e258e4..46236ffb528a 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -59,10 +59,14 @@ struct mhi_ep_db_info { * @mhi_event: Points to the event ring configurations table * @mhi_cmd: Points to the command ring configurations table * @sm: MHI Endpoint state machine + * @ch_ctx_cache: Cache of host channel context data structure + * @ev_ctx_cache: Cache of host event context data structure + * @cmd_ctx_cache: Cache of host command context data structure * @ch_ctx_host_pa: Physical address of host channel context data structure * @ev_ctx_host_pa: Physical address of host event context data structure * @cmd_ctx_host_pa: Physical address of host command context data structure * @chdb: Array of channel doorbell interrupt info + * @event_lock: Lock for protecting event rings * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -87,11 +91,15 @@ struct mhi_ep_cntrl { struct mhi_ep_cmd *mhi_cmd; struct mhi_ep_sm *sm; + struct mhi_chan_ctxt *ch_ctx_cache; + struct mhi_event_ctxt *ev_ctx_cache; + struct mhi_cmd_ctxt *cmd_ctx_cache; u64 ch_ctx_host_pa; u64 ev_ctx_host_pa; u64 cmd_ctx_host_pa; struct mhi_ep_db_info chdb[4]; + struct mutex event_lock; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From f9baa4f737950523ca648866dfa345ac378e4487 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:43 +0530 Subject: bus: mhi: ep: Add support for managing MHI state machine Add support for managing the MHI state machine by controlling the state transitions. Only the following MHI state transitions are supported: 1. Ready state 2. M0 state 3. M3 state 4. SYS_ERR state Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-8-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 46236ffb528a..2880d2aa88b8 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -67,6 +67,11 @@ struct mhi_ep_db_info { * @cmd_ctx_host_pa: Physical address of host command context data structure * @chdb: Array of channel doorbell interrupt info * @event_lock: Lock for protecting event rings + * @list_lock: Lock for protecting state transition and channel doorbell lists + * @state_lock: Lock for protecting state transitions + * @st_transition_list: List of state transitions + * @wq: Dedicated workqueue for handling rings and state changes + * @state_work: State transition worker * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -100,6 +105,13 @@ struct mhi_ep_cntrl { struct mhi_ep_db_info chdb[4]; struct mutex event_lock; + spinlock_t list_lock; + spinlock_t state_lock; + + struct list_head st_transition_list; + + struct workqueue_struct *wq; + struct work_struct state_work; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From 4799e71b082615445dc40ba0bbb86cbb76c24724 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:44 +0530 Subject: bus: mhi: ep: Add support for processing MHI endpoint interrupts Add support for processing MHI endpoint interrupts such as control interrupt, command interrupt and channel interrupt from the host. The interrupts will be generated in the endpoint device whenever host writes to the corresponding doorbell registers. The doorbell logic is handled inside the hardware internally. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-9-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 2880d2aa88b8..137bd3ee2e43 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -70,6 +70,7 @@ struct mhi_ep_db_info { * @list_lock: Lock for protecting state transition and channel doorbell lists * @state_lock: Lock for protecting state transitions * @st_transition_list: List of state transitions + * @ch_db_list: List of queued channel doorbells * @wq: Dedicated workqueue for handling rings and state changes * @state_work: State transition worker * @raise_irq: CB function for raising IRQ to the host @@ -85,6 +86,7 @@ struct mhi_ep_db_info { * @chdb_offset: Channel doorbell offset set by the host * @erdb_offset: Event ring doorbell offset set by the host * @index: MHI Endpoint controller index + * @irq: IRQ used by the endpoint controller */ struct mhi_ep_cntrl { struct device *cntrl_dev; @@ -109,6 +111,7 @@ struct mhi_ep_cntrl { spinlock_t state_lock; struct list_head st_transition_list; + struct list_head ch_db_list; struct workqueue_struct *wq; struct work_struct state_work; @@ -130,6 +133,7 @@ struct mhi_ep_cntrl { u32 chdb_offset; u32 erdb_offset; u32 index; + int irq; }; /** -- cgit v1.2.3 From fb3a26b7e8aff11e44d582604f61c38f63bd507c Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:45 +0530 Subject: bus: mhi: ep: Add support for powering up the MHI endpoint stack Add support for MHI endpoint power_up that includes initializing the MMIO and rings, caching the host MHI registers, and setting the MHI state to M0. After registering the MHI EP controller, the stack has to be powered up for usage. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-10-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 137bd3ee2e43..3b065f82fbeb 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -65,6 +65,9 @@ struct mhi_ep_db_info { * @ch_ctx_host_pa: Physical address of host channel context data structure * @ev_ctx_host_pa: Physical address of host event context data structure * @cmd_ctx_host_pa: Physical address of host command context data structure + * @ch_ctx_cache_phys: Physical address of the host channel context cache + * @ev_ctx_cache_phys: Physical address of the host event context cache + * @cmd_ctx_cache_phys: Physical address of the host command context cache * @chdb: Array of channel doorbell interrupt info * @event_lock: Lock for protecting event rings * @list_lock: Lock for protecting state transition and channel doorbell lists @@ -87,6 +90,7 @@ struct mhi_ep_db_info { * @erdb_offset: Event ring doorbell offset set by the host * @index: MHI Endpoint controller index * @irq: IRQ used by the endpoint controller + * @enabled: Check if the endpoint controller is enabled or not */ struct mhi_ep_cntrl { struct device *cntrl_dev; @@ -104,6 +108,9 @@ struct mhi_ep_cntrl { u64 ch_ctx_host_pa; u64 ev_ctx_host_pa; u64 cmd_ctx_host_pa; + phys_addr_t ch_ctx_cache_phys; + phys_addr_t ev_ctx_cache_phys; + phys_addr_t cmd_ctx_cache_phys; struct mhi_ep_db_info chdb[4]; struct mutex event_lock; @@ -134,6 +141,7 @@ struct mhi_ep_cntrl { u32 erdb_offset; u32 index; int irq; + bool enabled; }; /** @@ -228,4 +236,12 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl, */ void mhi_ep_unregister_controller(struct mhi_ep_cntrl *mhi_cntrl); +/** + * mhi_ep_power_up - Power up the MHI endpoint stack + * @mhi_cntrl: MHI Endpoint controller + * + * Return: 0 if power up succeeds, a negative error code otherwise. + */ +int mhi_ep_power_up(struct mhi_ep_cntrl *mhi_cntrl); + #endif -- cgit v1.2.3 From 5d507ee04894e166f8c5a29f05c6b06ce91d5833 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:46 +0530 Subject: bus: mhi: ep: Add support for powering down the MHI endpoint stack Add support for MHI endpoint power_down that includes stopping all available channels, destroying the channels, resetting the event and transfer rings and freeing the host cache. The stack will be powered down whenever the physical bus link goes down. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-11-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 3b065f82fbeb..9da683e8302c 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -244,4 +244,10 @@ void mhi_ep_unregister_controller(struct mhi_ep_cntrl *mhi_cntrl); */ int mhi_ep_power_up(struct mhi_ep_cntrl *mhi_cntrl); +/** + * mhi_ep_power_down - Power down the MHI endpoint stack + * @mhi_cntrl: MHI controller + */ +void mhi_ep_power_down(struct mhi_ep_cntrl *mhi_cntrl); + #endif -- cgit v1.2.3 From 7a97b6b47353c60dd1b53ada6180741437e377f2 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:47 +0530 Subject: bus: mhi: ep: Add support for handling MHI_RESET Add support for handling MHI_RESET in MHI endpoint stack. MHI_RESET will be issued by the host during shutdown and during error scenario so that it can recover the endpoint device without restarting the whole device. MHI_RESET handling involves resetting the internal MHI registers, data structures, state machines, resetting all channels/rings and setting MHICTRL.RESET bit to 0. Additionally the device will also move to READY state if the reset was due to SYS_ERR. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-12-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 9da683e8302c..2f31a54c205f 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -76,6 +76,7 @@ struct mhi_ep_db_info { * @ch_db_list: List of queued channel doorbells * @wq: Dedicated workqueue for handling rings and state changes * @state_work: State transition worker + * @reset_work: Worker for MHI Endpoint reset * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -122,6 +123,7 @@ struct mhi_ep_cntrl { struct workqueue_struct *wq; struct work_struct state_work; + struct work_struct reset_work; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From e827569062a804c67b51930ce83a4cb886113cb7 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:49 +0530 Subject: bus: mhi: ep: Add support for processing command rings Add support for processing the command rings. Command ring is used by the host to issue channel specific commands to the ep device. Following commands are supported: 1. Start channel 2. Stop channel 3. Reset channel Once the device receives the command doorbell interrupt from host, it executes the command and generates a command completion event to the host in the primary event ring. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-14-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 2f31a54c205f..8c6406d9c51f 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -77,6 +77,7 @@ struct mhi_ep_db_info { * @wq: Dedicated workqueue for handling rings and state changes * @state_work: State transition worker * @reset_work: Worker for MHI Endpoint reset + * @cmd_ring_work: Worker for processing command rings * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -124,6 +125,7 @@ struct mhi_ep_cntrl { struct workqueue_struct *wq; struct work_struct state_work; struct work_struct reset_work; + struct work_struct cmd_ring_work; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From 530125889977365cb6db32d7d0bd84c9f54c8aab Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:50 +0530 Subject: bus: mhi: ep: Add support for reading from the host Data transfer between host and the ep device happens over the transfer ring associated with each bi-directional channel pair. Host defines the transfer ring by allocating memory for it. The read and write pointer addresses of the transfer ring are stored in the channel context. Once host places the elements in the transfer ring, it increments the write pointer and rings the channel doorbell. Device will receive the doorbell interrupt and will process the transfer ring elements. This commit adds support for reading the transfer ring elements from the transfer ring till write pointer, incrementing the read pointer and finally sending the completion event to the host through corresponding event ring. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-15-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 8c6406d9c51f..fc7d197413eb 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -254,4 +254,13 @@ int mhi_ep_power_up(struct mhi_ep_cntrl *mhi_cntrl); */ void mhi_ep_power_down(struct mhi_ep_cntrl *mhi_cntrl); +/** + * mhi_ep_queue_is_empty - Determine whether the transfer queue is empty + * @mhi_dev: Device associated with the channels + * @dir: DMA direction for the channel + * + * Return: true if the queue is empty, false otherwise. + */ +bool mhi_ep_queue_is_empty(struct mhi_ep_device *mhi_dev, enum dma_data_direction dir); + #endif -- cgit v1.2.3 From 03c0bb8ec983f993a704417d73cc0a3511453d3e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:51 +0530 Subject: bus: mhi: ep: Add support for processing channel rings Add support for processing the channel rings from host. For the channel ring associated with DL channel, the xfer callback will simply invoked. For the case of UL channel, the ring elements will be read in a buffer till the write pointer and later passed to the client driver using the xfer callback. The client drivers should provide the callbacks for both UL and DL channels during registration. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-16-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index fc7d197413eb..eecc8f35d630 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -78,6 +78,7 @@ struct mhi_ep_db_info { * @state_work: State transition worker * @reset_work: Worker for MHI Endpoint reset * @cmd_ring_work: Worker for processing command rings + * @ch_ring_work: Worker for processing channel rings * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context @@ -126,6 +127,7 @@ struct mhi_ep_cntrl { struct work_struct state_work; struct work_struct reset_work; struct work_struct cmd_ring_work; + struct work_struct ch_ring_work; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From 2d945a394d9c1c59d88397cb383b11216d018a6b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:52 +0530 Subject: bus: mhi: ep: Add support for queueing SKBs to the host Add support for queueing SKBs to the host over the transfer ring of the relevant channel. The mhi_ep_queue_skb() API will be used by the client networking drivers to queue the SKBs to the host over MHI bus. The host will add ring elements to the transfer ring periodically for the device and the device will write SKBs to the ring elements. If a single SKB doesn't fit in a ring element (TRE), it will be placed in multiple ring elements and the overflow event will be sent for all ring elements except the last one. For the last ring element, the EOT event will be sent indicating the packet boundary. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-17-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi_ep.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index eecc8f35d630..478aece17046 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -265,4 +265,13 @@ void mhi_ep_power_down(struct mhi_ep_cntrl *mhi_cntrl); */ bool mhi_ep_queue_is_empty(struct mhi_ep_device *mhi_dev, enum dma_data_direction dir); +/** + * mhi_ep_queue_skb - Send SKBs to host over MHI Endpoint + * @mhi_dev: Device associated with the DL channel + * @skb: SKBs to be queued + * + * Return: 0 if the SKBs has been sent successfully, a negative error code otherwise. + */ +int mhi_ep_queue_skb(struct mhi_ep_device *mhi_dev, struct sk_buff *skb); + #endif -- cgit v1.2.3 From c268c0a8a33047cd957fecc1349d09a68eb6ad9e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 5 Apr 2022 19:27:54 +0530 Subject: bus: mhi: ep: Add uevent support for module autoloading Add uevent support to MHI endpoint bus so that the client drivers can be autoloaded by udev when the MHI endpoint devices gets created. The client drivers are expected to provide MODULE_DEVICE_TABLE with the MHI id_table struct so that the alias can be exported. The MHI endpoint reused the mhi_device_id structure of the MHI bus. Reviewed-by: Alex Elder Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220405135754.6622-19-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5da5d990ff58..549590e9c644 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -835,6 +835,8 @@ struct wmi_device_id { #define MHI_DEVICE_MODALIAS_FMT "mhi:%s" #define MHI_NAME_SIZE 32 +#define MHI_EP_DEVICE_MODALIAS_FMT "mhi_ep:%s" + /** * struct mhi_device_id - MHI device identification * @chan: MHI channel name -- cgit v1.2.3 From 31f6bd7fad3b149a1eb6f67fc2e742e4df369b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 25 Apr 2022 17:33:58 +0300 Subject: serial: Store character timing information to uart_port MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct uart_port currently stores FIFO timeout. Having character timing information readily available is useful. Even serial core itself determines char_time from port->timeout using inverse calculation. Store frame_time directly into uart_port. Character time is stored in nanoseconds to have reasonable precision with high rates. To avoid overflow, 64-bit math is necessary. It might be possible to determine timeout from frame_time by multiplying it with fifosize as needed but only part of the users seem to be protected by a lock. Thus, this patch does not pursue storing only frame_time in uart_port. Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220425143410.12703-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index d4828e69087a..cbd5070bc87f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -232,6 +232,7 @@ struct uart_port { int hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ unsigned int timeout; /* character-based timeout */ + unsigned int frame_time; /* frame timing in ns */ unsigned int type; /* port type */ const struct uart_ops *ops; unsigned int custom_divisor; -- cgit v1.2.3 From 7a20917d30fb627bebff6bacbe860ad4eb3a04d6 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 22 Apr 2022 15:23:45 -0700 Subject: device property: Add helper to match multiple connections In some cases multiple connections with the same connection id needs to be resolved from a fwnode graph. One such example is when separate hardware is used for performing muxing and/or orientation switching of the SuperSpeed and SBU lines in a USB Type-C connector. In this case the connector needs to belong to a graph with multiple matching remote endpoints, and the Type-C controller needs to be able to resolve them both. Add a new API that allows this kind of lookup. Reviewed-by: Andy Shevchenko Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220422222351.1297276-2-bjorn.andersson@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 4cd4b326941f..de7ff336d2c8 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -447,6 +447,11 @@ static inline void *device_connection_find_match(struct device *dev, return fwnode_connection_find_match(dev_fwnode(dev), con_id, data, match); } +int fwnode_connection_find_matches(struct fwnode_handle *fwnode, + const char *con_id, void *data, + devcon_match_fn_t match, + void **matches, unsigned int matches_len); + /* -------------------------------------------------------------------------- */ /* Software fwnode support - when HW description is incomplete or missing */ -- cgit v1.2.3 From 713fd49b430c37263c6cae2c82954f4e1cbcd90d Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 22 Apr 2022 15:23:48 -0700 Subject: usb: typec: mux: Introduce indirection Rather than directly exposing the implementation's representation of the typec muxes to the controller/clients, introduce an indirection object. This enables the introduction of turning this relationship into a one-to-many in the following patch. Acked-by: Heikki Krogerus Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220422222351.1297276-5-bjorn.andersson@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_mux.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/typec_mux.h b/include/linux/usb/typec_mux.h index a9d9957933dc..ee57781dcf28 100644 --- a/include/linux/usb/typec_mux.h +++ b/include/linux/usb/typec_mux.h @@ -8,11 +8,13 @@ struct device; struct typec_mux; +struct typec_mux_dev; struct typec_switch; +struct typec_switch_dev; struct typec_altmode; struct fwnode_handle; -typedef int (*typec_switch_set_fn_t)(struct typec_switch *sw, +typedef int (*typec_switch_set_fn_t)(struct typec_switch_dev *sw, enum typec_orientation orientation); struct typec_switch_desc { @@ -32,13 +34,13 @@ static inline struct typec_switch *typec_switch_get(struct device *dev) return fwnode_typec_switch_get(dev_fwnode(dev)); } -struct typec_switch * +struct typec_switch_dev * typec_switch_register(struct device *parent, const struct typec_switch_desc *desc); -void typec_switch_unregister(struct typec_switch *sw); +void typec_switch_unregister(struct typec_switch_dev *sw); -void typec_switch_set_drvdata(struct typec_switch *sw, void *data); -void *typec_switch_get_drvdata(struct typec_switch *sw); +void typec_switch_set_drvdata(struct typec_switch_dev *sw, void *data); +void *typec_switch_get_drvdata(struct typec_switch_dev *sw); struct typec_mux_state { struct typec_altmode *alt; @@ -46,7 +48,7 @@ struct typec_mux_state { void *data; }; -typedef int (*typec_mux_set_fn_t)(struct typec_mux *mux, +typedef int (*typec_mux_set_fn_t)(struct typec_mux_dev *mux, struct typec_mux_state *state); struct typec_mux_desc { @@ -67,11 +69,11 @@ typec_mux_get(struct device *dev, const struct typec_altmode_desc *desc) return fwnode_typec_mux_get(dev_fwnode(dev), desc); } -struct typec_mux * +struct typec_mux_dev * typec_mux_register(struct device *parent, const struct typec_mux_desc *desc); -void typec_mux_unregister(struct typec_mux *mux); +void typec_mux_unregister(struct typec_mux_dev *mux); -void typec_mux_set_drvdata(struct typec_mux *mux, void *data); -void *typec_mux_get_drvdata(struct typec_mux *mux); +void typec_mux_set_drvdata(struct typec_mux_dev *mux, void *data); +void *typec_mux_get_drvdata(struct typec_mux_dev *mux); #endif /* __USB_TYPEC_MUX */ -- cgit v1.2.3 From af1969a2d734d6272c0640b50c3ed31e59e203a9 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sat, 23 Apr 2022 20:42:03 -0400 Subject: USB: gadget: Rename usb_gadget_probe_driver() In preparation for adding a "gadget" bus, this patch renames usb_gadget_probe_driver() to usb_gadget_register_driver(). The new name will be more accurate, since gadget drivers will be registered on the gadget bus and the probing will be done by the driver core, not the UDC core. Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/YmSc29YZvxgT5fEJ@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 10fe57cf40be..5830b8a903da 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -745,7 +745,7 @@ struct usb_gadget_driver { */ /** - * usb_gadget_probe_driver - probe a gadget driver + * usb_gadget_register_driver - register a gadget driver * @driver: the driver being registered * Context: can sleep * @@ -755,7 +755,7 @@ struct usb_gadget_driver { * registration call returns. It's expected that the @bind() function will * be in init sections. */ -int usb_gadget_probe_driver(struct usb_gadget_driver *driver); +int usb_gadget_register_driver(struct usb_gadget_driver *driver); /** * usb_gadget_unregister_driver - unregister a gadget driver -- cgit v1.2.3 From fc274c1e997314bf47f6a62c79b5d7e554ed59c4 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sat, 23 Apr 2022 21:35:51 -0400 Subject: USB: gadget: Add a new bus for gadgets This patch adds a "gadget" bus and uses it for registering gadgets and their drivers. From now on, bindings will be managed by the driver core rather than through ad-hoc manipulations in the UDC core. As part of this change, the driver_pending_list is removed. The UDC core won't need to keep track of unbound drivers for later binding, because the driver core handles all of that for us. However, we do need one new feature: a way to prevent gadget drivers from being bound to more than one gadget at a time. The existing code does this automatically, but the driver core doesn't -- it's perfectly happy to bind a single driver to all the matching devices on the bus. The patch adds a new bitflag to the usb_gadget_driver structure for this purpose. A nice side effect of this change is a reduction in the total lines of code, since now the driver core will do part of the work that the UDC used to do. A possible future patch could add udc devices to the gadget bus, say as a separate device type. Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/YmSpdxaDNeC2BBOf@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 5830b8a903da..cf7af8a0a6e9 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -664,9 +664,9 @@ static inline int usb_gadget_check_config(struct usb_gadget *gadget) * @driver: Driver model state for this driver. * @udc_name: A name of UDC this driver should be bound to. If udc_name is NULL, * this driver will be bound to any available UDC. - * @pending: UDC core private data used for deferred probe of this driver. - * @match_existing_only: If udc is not found, return an error and don't add this - * gadget driver to list of pending driver + * @match_existing_only: If udc is not found, return an error and fail + * the driver registration + * @is_bound: Allow a driver to be bound to only one gadget * * Devices are disabled till a gadget driver successfully bind()s, which * means the driver will handle setup() requests needed to enumerate (and @@ -729,8 +729,8 @@ struct usb_gadget_driver { struct device_driver driver; char *udc_name; - struct list_head pending; unsigned match_existing_only:1; + bool is_bound:1; }; @@ -740,22 +740,30 @@ struct usb_gadget_driver { /* driver modules register and unregister, as usual. * these calls must be made in a context that can sleep. * - * these will usually be implemented directly by the hardware-dependent - * usb bus interface driver, which will only support a single driver. + * A gadget driver can be bound to only one gadget at a time. */ /** - * usb_gadget_register_driver - register a gadget driver + * usb_gadget_register_driver_owner - register a gadget driver * @driver: the driver being registered + * @owner: the driver module + * @mod_name: the driver module's build name * Context: can sleep * * Call this in your gadget driver's module initialization function, - * to tell the underlying usb controller driver about your driver. + * to tell the underlying UDC controller driver about your driver. * The @bind() function will be called to bind it to a gadget before this * registration call returns. It's expected that the @bind() function will * be in init sections. + * + * Use the macro defined below instead of calling this directly. */ -int usb_gadget_register_driver(struct usb_gadget_driver *driver); +int usb_gadget_register_driver_owner(struct usb_gadget_driver *driver, + struct module *owner, const char *mod_name); + +/* use a define to avoid include chaining to get THIS_MODULE & friends */ +#define usb_gadget_register_driver(driver) \ + usb_gadget_register_driver_owner(driver, THIS_MODULE, KBUILD_MODNAME) /** * usb_gadget_unregister_driver - unregister a gadget driver -- cgit v1.2.3 From 8e8b11956486e3fe8cacf54a1d492ebdd8cc1fb2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 17 Feb 2022 10:42:52 -0800 Subject: of/platform: Add stubs for of_platform_device_create/destroy() Code for platform_device_create() and of_platform_device_destroy() is only generated if CONFIG_OF_ADDRESS=y. Add stubs to avoid unresolved symbols when CONFIG_OF_ADDRESS is not set. Reviewed-by: Stephen Boyd Reviewed-by: Douglas Anderson Acked-by: Rob Herring Signed-off-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20220217104219.v21.1.I08fd2e1c775af04f663730e9fb4d00e6bbb38541@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/of_platform.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 84a966623e78..d15b6cd5e1c3 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -61,16 +61,18 @@ static inline struct platform_device *of_find_device_by_node(struct device_node } #endif +extern int of_platform_bus_probe(struct device_node *root, + const struct of_device_id *matches, + struct device *parent); + +#ifdef CONFIG_OF_ADDRESS /* Platform devices and busses creation */ extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent); extern int of_platform_device_destroy(struct device *dev, void *data); -extern int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent); -#ifdef CONFIG_OF_ADDRESS + extern int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, @@ -84,6 +86,18 @@ extern int devm_of_platform_populate(struct device *dev); extern void devm_of_platform_depopulate(struct device *dev); #else +/* Platform devices and busses creation */ +static inline struct platform_device *of_platform_device_create(struct device_node *np, + const char *bus_id, + struct device *parent) +{ + return NULL; +} +static inline int of_platform_device_destroy(struct device *dev, void *data) +{ + return -ENODEV; +} + static inline int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, -- cgit v1.2.3 From 0298b4b95cb373c21e6323c905589f8dac42c5b4 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 17 Feb 2022 10:42:53 -0800 Subject: usb: misc: Add onboard_usb_hub driver The main issue this driver addresses is that a USB hub needs to be powered before it can be discovered. For discrete onboard hubs (an example for such a hub is the Realtek RTS5411) this is often solved by supplying the hub with an 'always-on' regulator, which is kind of a hack. Some onboard hubs may require further initialization steps, like changing the state of a GPIO or enabling a clock, which requires even more hacks. This driver creates a platform device representing the hub which performs the necessary initialization. Currently it only supports switching on a single regulator, support for multiple regulators or other actions can be added as needed. Different initialization sequences can be supported based on the compatible string. Besides performing the initialization the driver can be configured to power the hub off during system suspend. This can help to extend battery life on battery powered devices which have no requirements to keep the hub powered during suspend. The driver can also be configured to leave the hub powered when a wakeup capable USB device is connected when suspending, and power it off otherwise. Technically the driver consists of two drivers, the platform driver described above and a very thin USB driver that subclasses the generic driver. The purpose of this driver is to provide the platform driver with the USB devices corresponding to the hub(s) (a hub controller may provide multiple 'logical' hubs, e.g. one to support USB 2.0 and another for USB 3.x). Note: the current series only supports hubs connected directly to a root hub, support for other configurations could be added if needed. Co-developed-by: Ravi Chandra Sadineni Reviewed-by: Douglas Anderson Acked-by: Alan Stern Signed-off-by: Ravi Chandra Sadineni Signed-off-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20220217104219.v21.2.I7c9a1f1d6ced41dd8310e8a03da666a32364e790@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/onboard_hub.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/usb/onboard_hub.h (limited to 'include/linux') diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h new file mode 100644 index 000000000000..d9373230556e --- /dev/null +++ b/include/linux/usb/onboard_hub.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_USB_ONBOARD_HUB_H +#define __LINUX_USB_ONBOARD_HUB_H + +struct usb_device; +struct list_head; + +#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB) +void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list); +void onboard_hub_destroy_pdevs(struct list_head *pdev_list); +#else +static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub, + struct list_head *pdev_list) {} +static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {} +#endif + +#endif /* __LINUX_USB_ONBOARD_HUB_H */ -- cgit v1.2.3 From c40b62216c1aecc0dc00faf33d71bd71cb440337 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 17 Feb 2022 10:42:54 -0800 Subject: usb: core: hcd: Create platform devices for onboard hubs in probe() Call onboard_hub_create/destroy_pdevs() from usb_add/remove_hcd() for primary HCDs to create/destroy platform devices for onboard USB hubs that may be connected to the root hub of the controller. These functions are a NOP unless CONFIG_USB_ONBOARD_HUB=y/m. Also add a field to struct usb_hcd to keep track of the onboard hub platform devices that are owned by the HCD. Reviewed-by: Douglas Anderson Reviewed-by: Stephen Boyd Signed-off-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20220217104219.v21.3.I7a3a7d9d2126c34079b1cab87aa0b2ec3030f9b7@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 548a028f2dab..4ebc91c09182 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -198,6 +198,7 @@ struct usb_hcd { struct usb_hcd *shared_hcd; struct usb_hcd *primary_hcd; + struct list_head onboard_hub_devs; #define HCD_BUFFER_POOLS 4 struct dma_pool *pool[HCD_BUFFER_POOLS]; -- cgit v1.2.3 From 9723f69d1de37ca25474372ad1753ea39bb0dce1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 8 Apr 2022 10:00:43 +0200 Subject: mmc: core: improve API to make clear that mmc_sw_reset is for cards To make it unambiguous that mmc_sw_reset() is for cards and not for controllers, we make the function argument mmc_card instead of mmc_host. There are no users to convert currently. Suggested-by: Ulf Hansson Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220408080045.6497-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index de5c64bbdb72..6efec0b9820c 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -176,7 +176,7 @@ int mmc_wait_for_cmd(struct mmc_host *host, struct mmc_command *cmd, int retries); int mmc_hw_reset(struct mmc_card *card); -int mmc_sw_reset(struct mmc_host *host); +int mmc_sw_reset(struct mmc_card *card); void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card); #endif /* LINUX_MMC_CORE_H */ -- cgit v1.2.3 From 32f18e596141f40dbc5d8700b2730371ffd3055f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 8 Apr 2022 10:00:44 +0200 Subject: mmc: improve API to make clear hw_reset callback is for cards To make it unambiguous that the hw_reset callback is for cards and not for controllers, we add 'card' to the callback name and convert all users in one go. We keep the argument as mmc_host, though, because the callback is used very early when mmc_card is not yet populated. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220408080045.6497-4-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 7afb57cab00b..c193c50ccd78 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -181,7 +181,7 @@ struct mmc_host_ops { unsigned int max_dtr, int host_drv, int card_drv, int *drv_type); /* Reset the eMMC card via RST_n */ - void (*hw_reset)(struct mmc_host *host); + void (*card_hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); /* -- cgit v1.2.3 From 13acb62ce1ee7377ef03034fbbad6f5499464b86 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 12 Apr 2022 11:31:02 +0200 Subject: mmc: sh_mmcif: move platform_data header to proper location We have a dedicated directory for platform_data meanwhile, don't spoil the MMC directory with it. Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220412093102.3428-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/sh_mmcif.h | 209 --------------------------------- include/linux/platform_data/sh_mmcif.h | 207 ++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+), 209 deletions(-) delete mode 100644 include/linux/mmc/sh_mmcif.h create mode 100644 include/linux/platform_data/sh_mmcif.h (limited to 'include/linux') diff --git a/include/linux/mmc/sh_mmcif.h b/include/linux/mmc/sh_mmcif.h deleted file mode 100644 index e25533b95d9f..000000000000 --- a/include/linux/mmc/sh_mmcif.h +++ /dev/null @@ -1,209 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * include/linux/mmc/sh_mmcif.h - * - * platform data for eMMC driver - * - * Copyright (C) 2010 Renesas Solutions Corp. - */ - -#ifndef LINUX_MMC_SH_MMCIF_H -#define LINUX_MMC_SH_MMCIF_H - -#include -#include - -/* - * MMCIF : CE_CLK_CTRL [19:16] - * 1000 : Peripheral clock / 512 - * 0111 : Peripheral clock / 256 - * 0110 : Peripheral clock / 128 - * 0101 : Peripheral clock / 64 - * 0100 : Peripheral clock / 32 - * 0011 : Peripheral clock / 16 - * 0010 : Peripheral clock / 8 - * 0001 : Peripheral clock / 4 - * 0000 : Peripheral clock / 2 - * 1111 : Peripheral clock (sup_pclk set '1') - */ - -struct sh_mmcif_plat_data { - unsigned int slave_id_tx; /* embedded slave_id_[tr]x */ - unsigned int slave_id_rx; - u8 sup_pclk; /* 1 :SH7757, 0: SH7724/SH7372 */ - unsigned long caps; - u32 ocr; -}; - -#define MMCIF_CE_CMD_SET 0x00000000 -#define MMCIF_CE_ARG 0x00000008 -#define MMCIF_CE_ARG_CMD12 0x0000000C -#define MMCIF_CE_CMD_CTRL 0x00000010 -#define MMCIF_CE_BLOCK_SET 0x00000014 -#define MMCIF_CE_CLK_CTRL 0x00000018 -#define MMCIF_CE_BUF_ACC 0x0000001C -#define MMCIF_CE_RESP3 0x00000020 -#define MMCIF_CE_RESP2 0x00000024 -#define MMCIF_CE_RESP1 0x00000028 -#define MMCIF_CE_RESP0 0x0000002C -#define MMCIF_CE_RESP_CMD12 0x00000030 -#define MMCIF_CE_DATA 0x00000034 -#define MMCIF_CE_INT 0x00000040 -#define MMCIF_CE_INT_MASK 0x00000044 -#define MMCIF_CE_HOST_STS1 0x00000048 -#define MMCIF_CE_HOST_STS2 0x0000004C -#define MMCIF_CE_CLK_CTRL2 0x00000070 -#define MMCIF_CE_VERSION 0x0000007C - -/* CE_BUF_ACC */ -#define BUF_ACC_DMAWEN (1 << 25) -#define BUF_ACC_DMAREN (1 << 24) -#define BUF_ACC_BUSW_32 (0 << 17) -#define BUF_ACC_BUSW_16 (1 << 17) -#define BUF_ACC_ATYP (1 << 16) - -/* CE_CLK_CTRL */ -#define CLK_ENABLE (1 << 24) /* 1: output mmc clock */ -#define CLK_CLEAR (0xf << 16) -#define CLK_SUP_PCLK (0xf << 16) -#define CLKDIV_4 (1 << 16) /* mmc clock frequency. - * n: bus clock/(2^(n+1)) */ -#define CLKDIV_256 (7 << 16) /* mmc clock frequency. (see above) */ -#define SRSPTO_256 (2 << 12) /* resp timeout */ -#define SRBSYTO_29 (0xf << 8) /* resp busy timeout */ -#define SRWDTO_29 (0xf << 4) /* read/write timeout */ -#define SCCSTO_29 (0xf << 0) /* ccs timeout */ - -/* CE_VERSION */ -#define SOFT_RST_ON (1 << 31) -#define SOFT_RST_OFF 0 - -static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) -{ - return __raw_readl(addr + reg); -} - -static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) -{ - __raw_writel(val, addr + reg); -} - -#define SH_MMCIF_BBS 512 /* boot block size */ - -static inline void sh_mmcif_boot_cmd_send(void __iomem *base, - unsigned long cmd, unsigned long arg) -{ - sh_mmcif_writel(base, MMCIF_CE_INT, 0); - sh_mmcif_writel(base, MMCIF_CE_ARG, arg); - sh_mmcif_writel(base, MMCIF_CE_CMD_SET, cmd); -} - -static inline int sh_mmcif_boot_cmd_poll(void __iomem *base, unsigned long mask) -{ - unsigned long tmp; - int cnt; - - for (cnt = 0; cnt < 1000000; cnt++) { - tmp = sh_mmcif_readl(base, MMCIF_CE_INT); - if (tmp & mask) { - sh_mmcif_writel(base, MMCIF_CE_INT, tmp & ~mask); - return 0; - } - } - - return -1; -} - -static inline int sh_mmcif_boot_cmd(void __iomem *base, - unsigned long cmd, unsigned long arg) -{ - sh_mmcif_boot_cmd_send(base, cmd, arg); - return sh_mmcif_boot_cmd_poll(base, 0x00010000); -} - -static inline int sh_mmcif_boot_do_read_single(void __iomem *base, - unsigned int block_nr, - unsigned long *buf) -{ - int k; - - /* CMD13 - Status */ - sh_mmcif_boot_cmd(base, 0x0d400000, 0x00010000); - - if (sh_mmcif_readl(base, MMCIF_CE_RESP0) != 0x0900) - return -1; - - /* CMD17 - Read */ - sh_mmcif_boot_cmd(base, 0x11480000, block_nr * SH_MMCIF_BBS); - if (sh_mmcif_boot_cmd_poll(base, 0x00100000) < 0) - return -1; - - for (k = 0; k < (SH_MMCIF_BBS / 4); k++) - buf[k] = sh_mmcif_readl(base, MMCIF_CE_DATA); - - return 0; -} - -static inline int sh_mmcif_boot_do_read(void __iomem *base, - unsigned long first_block, - unsigned long nr_blocks, - void *buf) -{ - unsigned long k; - int ret = 0; - - /* In data transfer mode: Set clock to Bus clock/4 (about 20Mhz) */ - sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, - CLK_ENABLE | CLKDIV_4 | SRSPTO_256 | - SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); - - /* CMD9 - Get CSD */ - sh_mmcif_boot_cmd(base, 0x09806000, 0x00010000); - - /* CMD7 - Select the card */ - sh_mmcif_boot_cmd(base, 0x07400000, 0x00010000); - - /* CMD16 - Set the block size */ - sh_mmcif_boot_cmd(base, 0x10400000, SH_MMCIF_BBS); - - for (k = 0; !ret && k < nr_blocks; k++) - ret = sh_mmcif_boot_do_read_single(base, first_block + k, - buf + (k * SH_MMCIF_BBS)); - - return ret; -} - -static inline void sh_mmcif_boot_init(void __iomem *base) -{ - /* reset */ - sh_mmcif_writel(base, MMCIF_CE_VERSION, SOFT_RST_ON); - sh_mmcif_writel(base, MMCIF_CE_VERSION, SOFT_RST_OFF); - - /* byte swap */ - sh_mmcif_writel(base, MMCIF_CE_BUF_ACC, BUF_ACC_ATYP); - - /* Set block size in MMCIF hardware */ - sh_mmcif_writel(base, MMCIF_CE_BLOCK_SET, SH_MMCIF_BBS); - - /* Enable the clock, set it to Bus clock/256 (about 325Khz). */ - sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, - CLK_ENABLE | CLKDIV_256 | SRSPTO_256 | - SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); - - /* CMD0 */ - sh_mmcif_boot_cmd(base, 0x00000040, 0); - - /* CMD1 - Get OCR */ - do { - sh_mmcif_boot_cmd(base, 0x01405040, 0x40300000); /* CMD1 */ - } while ((sh_mmcif_readl(base, MMCIF_CE_RESP0) & 0x80000000) - != 0x80000000); - - /* CMD2 - Get CID */ - sh_mmcif_boot_cmd(base, 0x02806040, 0); - - /* CMD3 - Set card relative address */ - sh_mmcif_boot_cmd(base, 0x03400040, 0x00010000); -} - -#endif /* LINUX_MMC_SH_MMCIF_H */ diff --git a/include/linux/platform_data/sh_mmcif.h b/include/linux/platform_data/sh_mmcif.h new file mode 100644 index 000000000000..6eb914f958f9 --- /dev/null +++ b/include/linux/platform_data/sh_mmcif.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * platform data for eMMC driver + * + * Copyright (C) 2010 Renesas Solutions Corp. + */ + +#ifndef LINUX_MMC_SH_MMCIF_H +#define LINUX_MMC_SH_MMCIF_H + +#include +#include + +/* + * MMCIF : CE_CLK_CTRL [19:16] + * 1000 : Peripheral clock / 512 + * 0111 : Peripheral clock / 256 + * 0110 : Peripheral clock / 128 + * 0101 : Peripheral clock / 64 + * 0100 : Peripheral clock / 32 + * 0011 : Peripheral clock / 16 + * 0010 : Peripheral clock / 8 + * 0001 : Peripheral clock / 4 + * 0000 : Peripheral clock / 2 + * 1111 : Peripheral clock (sup_pclk set '1') + */ + +struct sh_mmcif_plat_data { + unsigned int slave_id_tx; /* embedded slave_id_[tr]x */ + unsigned int slave_id_rx; + u8 sup_pclk; /* 1 :SH7757, 0: SH7724/SH7372 */ + unsigned long caps; + u32 ocr; +}; + +#define MMCIF_CE_CMD_SET 0x00000000 +#define MMCIF_CE_ARG 0x00000008 +#define MMCIF_CE_ARG_CMD12 0x0000000C +#define MMCIF_CE_CMD_CTRL 0x00000010 +#define MMCIF_CE_BLOCK_SET 0x00000014 +#define MMCIF_CE_CLK_CTRL 0x00000018 +#define MMCIF_CE_BUF_ACC 0x0000001C +#define MMCIF_CE_RESP3 0x00000020 +#define MMCIF_CE_RESP2 0x00000024 +#define MMCIF_CE_RESP1 0x00000028 +#define MMCIF_CE_RESP0 0x0000002C +#define MMCIF_CE_RESP_CMD12 0x00000030 +#define MMCIF_CE_DATA 0x00000034 +#define MMCIF_CE_INT 0x00000040 +#define MMCIF_CE_INT_MASK 0x00000044 +#define MMCIF_CE_HOST_STS1 0x00000048 +#define MMCIF_CE_HOST_STS2 0x0000004C +#define MMCIF_CE_CLK_CTRL2 0x00000070 +#define MMCIF_CE_VERSION 0x0000007C + +/* CE_BUF_ACC */ +#define BUF_ACC_DMAWEN (1 << 25) +#define BUF_ACC_DMAREN (1 << 24) +#define BUF_ACC_BUSW_32 (0 << 17) +#define BUF_ACC_BUSW_16 (1 << 17) +#define BUF_ACC_ATYP (1 << 16) + +/* CE_CLK_CTRL */ +#define CLK_ENABLE (1 << 24) /* 1: output mmc clock */ +#define CLK_CLEAR (0xf << 16) +#define CLK_SUP_PCLK (0xf << 16) +#define CLKDIV_4 (1 << 16) /* mmc clock frequency. + * n: bus clock/(2^(n+1)) */ +#define CLKDIV_256 (7 << 16) /* mmc clock frequency. (see above) */ +#define SRSPTO_256 (2 << 12) /* resp timeout */ +#define SRBSYTO_29 (0xf << 8) /* resp busy timeout */ +#define SRWDTO_29 (0xf << 4) /* read/write timeout */ +#define SCCSTO_29 (0xf << 0) /* ccs timeout */ + +/* CE_VERSION */ +#define SOFT_RST_ON (1 << 31) +#define SOFT_RST_OFF 0 + +static inline u32 sh_mmcif_readl(void __iomem *addr, int reg) +{ + return __raw_readl(addr + reg); +} + +static inline void sh_mmcif_writel(void __iomem *addr, int reg, u32 val) +{ + __raw_writel(val, addr + reg); +} + +#define SH_MMCIF_BBS 512 /* boot block size */ + +static inline void sh_mmcif_boot_cmd_send(void __iomem *base, + unsigned long cmd, unsigned long arg) +{ + sh_mmcif_writel(base, MMCIF_CE_INT, 0); + sh_mmcif_writel(base, MMCIF_CE_ARG, arg); + sh_mmcif_writel(base, MMCIF_CE_CMD_SET, cmd); +} + +static inline int sh_mmcif_boot_cmd_poll(void __iomem *base, unsigned long mask) +{ + unsigned long tmp; + int cnt; + + for (cnt = 0; cnt < 1000000; cnt++) { + tmp = sh_mmcif_readl(base, MMCIF_CE_INT); + if (tmp & mask) { + sh_mmcif_writel(base, MMCIF_CE_INT, tmp & ~mask); + return 0; + } + } + + return -1; +} + +static inline int sh_mmcif_boot_cmd(void __iomem *base, + unsigned long cmd, unsigned long arg) +{ + sh_mmcif_boot_cmd_send(base, cmd, arg); + return sh_mmcif_boot_cmd_poll(base, 0x00010000); +} + +static inline int sh_mmcif_boot_do_read_single(void __iomem *base, + unsigned int block_nr, + unsigned long *buf) +{ + int k; + + /* CMD13 - Status */ + sh_mmcif_boot_cmd(base, 0x0d400000, 0x00010000); + + if (sh_mmcif_readl(base, MMCIF_CE_RESP0) != 0x0900) + return -1; + + /* CMD17 - Read */ + sh_mmcif_boot_cmd(base, 0x11480000, block_nr * SH_MMCIF_BBS); + if (sh_mmcif_boot_cmd_poll(base, 0x00100000) < 0) + return -1; + + for (k = 0; k < (SH_MMCIF_BBS / 4); k++) + buf[k] = sh_mmcif_readl(base, MMCIF_CE_DATA); + + return 0; +} + +static inline int sh_mmcif_boot_do_read(void __iomem *base, + unsigned long first_block, + unsigned long nr_blocks, + void *buf) +{ + unsigned long k; + int ret = 0; + + /* In data transfer mode: Set clock to Bus clock/4 (about 20Mhz) */ + sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, + CLK_ENABLE | CLKDIV_4 | SRSPTO_256 | + SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); + + /* CMD9 - Get CSD */ + sh_mmcif_boot_cmd(base, 0x09806000, 0x00010000); + + /* CMD7 - Select the card */ + sh_mmcif_boot_cmd(base, 0x07400000, 0x00010000); + + /* CMD16 - Set the block size */ + sh_mmcif_boot_cmd(base, 0x10400000, SH_MMCIF_BBS); + + for (k = 0; !ret && k < nr_blocks; k++) + ret = sh_mmcif_boot_do_read_single(base, first_block + k, + buf + (k * SH_MMCIF_BBS)); + + return ret; +} + +static inline void sh_mmcif_boot_init(void __iomem *base) +{ + /* reset */ + sh_mmcif_writel(base, MMCIF_CE_VERSION, SOFT_RST_ON); + sh_mmcif_writel(base, MMCIF_CE_VERSION, SOFT_RST_OFF); + + /* byte swap */ + sh_mmcif_writel(base, MMCIF_CE_BUF_ACC, BUF_ACC_ATYP); + + /* Set block size in MMCIF hardware */ + sh_mmcif_writel(base, MMCIF_CE_BLOCK_SET, SH_MMCIF_BBS); + + /* Enable the clock, set it to Bus clock/256 (about 325Khz). */ + sh_mmcif_writel(base, MMCIF_CE_CLK_CTRL, + CLK_ENABLE | CLKDIV_256 | SRSPTO_256 | + SRBSYTO_29 | SRWDTO_29 | SCCSTO_29); + + /* CMD0 */ + sh_mmcif_boot_cmd(base, 0x00000040, 0); + + /* CMD1 - Get OCR */ + do { + sh_mmcif_boot_cmd(base, 0x01405040, 0x40300000); /* CMD1 */ + } while ((sh_mmcif_readl(base, MMCIF_CE_RESP0) & 0x80000000) + != 0x80000000); + + /* CMD2 - Get CID */ + sh_mmcif_boot_cmd(base, 0x02806040, 0); + + /* CMD3 - Set card relative address */ + sh_mmcif_boot_cmd(base, 0x03400040, 0x00010000); +} + +#endif /* LINUX_MMC_SH_MMCIF_H */ -- cgit v1.2.3 From 8e274732115f63c1d09136284431b3555bd5cc56 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 25 Apr 2022 23:04:28 +0206 Subject: printk: extend console_lock for per-console locking Currently threaded console printers synchronize against each other using console_lock(). However, different console drivers are unrelated and do not require any synchronization between each other. Removing the synchronization between the threaded console printers will allow each console to print at its own speed. But the threaded consoles printers do still need to synchronize against console_lock() callers. Introduce a per-console mutex and a new console boolean field @blocked to provide this synchronization. console_lock() is modified so that it must acquire the mutex of each console in order to set the @blocked field. Console printing threads will acquire their mutex while printing a record. If @blocked was set, the thread will go back to sleep instead of printing. The reason for the @blocked boolean field is so that console_lock() callers do not need to acquire multiple console mutexes simultaneously, which would introduce unnecessary complexity due to nested mutex locking. Also, a new field was chosen instead of adding a new @flags value so that the blocked status could be checked without concern of reading inconsistent values due to @flags updates from other contexts. Threaded console printers also need to synchronize against console_trylock() callers. Since console_trylock() may be called from any context, the per-console mutex cannot be used for this synchronization. (mutex_trylock() cannot be called from atomic contexts.) Introduce a global atomic counter to identify if any threaded printers are active. The threaded printers will also check the atomic counter to identify if the console has been locked by another task via console_trylock(). Note that @console_sem is still used to provide synchronization between console_lock() and console_trylock() callers. A locking overview for console_lock(), console_trylock(), and the threaded printers is as follows (pseudo code): console_lock() { down(&console_sem); for_each_console(con) { mutex_lock(&con->lock); con->blocked = true; mutex_unlock(&con->lock); } /* console_lock acquired */ } console_trylock() { if (down_trylock(&console_sem) == 0) { if (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) { /* console_lock acquired */ } } } threaded_printer() { mutex_lock(&con->lock); if (!con->blocked) { /* console_lock() callers blocked */ if (atomic_inc_unless_negative(&console_kthreads_active)) { /* console_trylock() callers blocked */ con->write(); atomic_dec(&console_lock_count); } } mutex_unlock(&con->lock); } The console owner and waiter logic now only applies between contexts that have taken the console_lock via console_trylock(). Threaded printers never take the console_lock, so they do not have a console_lock to handover. Tasks that have used console_lock() will block the threaded printers using a mutex and if the console_lock is handed over to an atomic context, it would be unable to unblock the threaded printers. However, the console_trylock() case is really the only scenario that is interesting for handovers anyway. @panic_console_dropped must change to atomic_t since it is no longer protected exclusively by the console_lock. Since threaded printers remain asleep if they see that the console is locked, they now must be explicitly woken in __console_unlock(). This means wake_up_klogd() calls following a console_unlock() are no longer necessary and are removed. Also note that threaded printers no longer need to check @console_suspended. The check for the @blocked field implicitly covers the suspended console case. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/878rrs6ft7.fsf@jogness.linutronix.de --- include/linux/console.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 9a251e70c090..143653090c48 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,6 +16,7 @@ #include #include +#include struct vc_data; struct console_font_op; @@ -154,6 +155,20 @@ struct console { u64 seq; unsigned long dropped; struct task_struct *thread; + bool blocked; + + /* + * The per-console lock is used by printing kthreads to synchronize + * this console with callers of console_lock(). This is necessary in + * order to allow printing kthreads to run in parallel to each other, + * while each safely accessing the @blocked field and synchronizing + * against direct printing via console_lock/console_unlock. + * + * Note: For synchronizing against direct printing via + * console_trylock/console_unlock, see the static global + * variable @console_kthreads_active. + */ + struct mutex lock; void *data; struct console *next; -- cgit v1.2.3 From 33337d03f04f9ea3a50ac2d3490a5d7a3ba9be82 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Tue, 26 Apr 2022 01:29:05 -0700 Subject: io_uring: add io_uring_get_opcode In some debug scenarios it is useful to have the text representation of the opcode. Add this function in preparation. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220426082907.3600028-3-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 1814e698d861..24651c229ed2 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -10,6 +10,7 @@ struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); +const char *io_uring_get_opcode(u8 opcode); static inline void io_uring_files_cancel(void) { @@ -42,6 +43,10 @@ static inline void io_uring_files_cancel(void) static inline void io_uring_free(struct task_struct *tsk) { } +static inline const char *io_uring_get_opcode(u8 opcode) +{ + return ""; +} #endif #endif -- cgit v1.2.3 From 54c861f930180e096483a6e1744c9c4b76e20da5 Mon Sep 17 00:00:00 2001 From: Daniel Ammann Date: Tue, 5 Apr 2022 14:54:25 +0200 Subject: mfd: tps65218: Fix trivial typo in comment The driver is for TPS65218, not TPS65219. Signed-off-by: Daniel Ammann Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220405125426.28016-1-daniel.ammann@bytesatwork.ch --- include/linux/mfd/tps65218.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65218.h b/include/linux/mfd/tps65218.h index f4ca367e3473..122e24ddbd4b 100644 --- a/include/linux/mfd/tps65218.h +++ b/include/linux/mfd/tps65218.h @@ -1,7 +1,7 @@ /* * linux/mfd/tps65218.h * - * Functions to access TPS65219 power management chip. + * Functions to access TPS65218 power management chip. * * Copyright (C) 2014 Texas Instruments Incorporated - https://www.ti.com/ * -- cgit v1.2.3 From 7f5aaa4a0ae6948eace6cf30f40a3ec27ece8441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Wed, 6 Apr 2022 11:40:41 -0300 Subject: mfd: hi655x-pmic: Replace legacy gpio interface for gpiod interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Considering the current transition of the GPIO subsystem, remove all dependencies of the legacy GPIO interface (linux/gpio.h and linux /of_gpio.h) and replace it with the descriptor-based GPIO approach. Signed-off-by: Maíra Canal Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/Yk2maZuf+5FGL+eg@fedora --- include/linux/mfd/hi655x-pmic.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/hi655x-pmic.h b/include/linux/mfd/hi655x-pmic.h index af5d97239c0d..6a012784dd1b 100644 --- a/include/linux/mfd/hi655x-pmic.h +++ b/include/linux/mfd/hi655x-pmic.h @@ -12,6 +12,8 @@ #ifndef __HI655X_PMIC_H #define __HI655X_PMIC_H +#include + /* Hi655x registers are mapped to memory bus in 4 bytes stride */ #define HI655X_STRIDE 4 #define HI655X_BUS_ADDR(x) ((x) << 2) @@ -53,7 +55,7 @@ struct hi655x_pmic { struct resource *res; struct device *dev; struct regmap *regmap; - int gpio; + struct gpio_desc *gpio; unsigned int ver; struct regmap_irq_chip_data *irq_data; }; -- cgit v1.2.3 From 82028ba4d590c4e9eb3c93326019db7c26d02e35 Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Fri, 15 Apr 2022 17:36:24 +0200 Subject: mfd: mt6359: Add missing defines necessary for mtk-pmic-keys support Add 2 missing MT6359 registers that are needed to implement the keyboard driver. Signed-off-by: Fabien Parent Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220415153629.1817202-3-fparent@baylibre.com --- include/linux/mfd/mt6359/registers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/mt6359/registers.h b/include/linux/mfd/mt6359/registers.h index 2135c9695918..2a4394a27b1c 100644 --- a/include/linux/mfd/mt6359/registers.h +++ b/include/linux/mfd/mt6359/registers.h @@ -8,6 +8,8 @@ /* PMIC Registers */ #define MT6359_SWCID 0xa +#define MT6359_TOPSTATUS 0x2a +#define MT6359_TOP_RST_MISC 0x14c #define MT6359_MISC_TOP_INT_CON0 0x188 #define MT6359_MISC_TOP_INT_STATUS0 0x194 #define MT6359_TOP_INT_STATUS0 0x19e -- cgit v1.2.3 From c317ab71facc2cd0a94145973318a4c914e11acc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 25 Apr 2022 21:32:47 +0800 Subject: bpf: Compute map_btf_id during build time For now, the field 'map_btf_id' in 'struct bpf_map_ops' for all map types are computed during vmlinux-btf init: btf_parse_vmlinux() -> btf_vmlinux_map_ids_init() It will lookup the btf_type according to the 'map_btf_name' field in 'struct bpf_map_ops'. This process can be done during build time, thanks to Jiri's resolve_btfids. selftest of map_ptr has passed: $96 map_ptr:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Reported-by: kernel test robot Signed-off-by: Menglong Dong Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0af5793ba417..be94833d390a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -148,8 +148,7 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); - /* BTF name and id of struct allocated by map_alloc */ - const char * const map_btf_name; + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; /* bpf_iter info used to open a seq_file */ -- cgit v1.2.3 From 3ce0f2373f7073f04715b1ffd7b1812d3183f79a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:57 +0800 Subject: compat: consolidate the compat_flock{,64} definition Provide a single common definition for the compat_flock and compat_flock64 structures using the same tricks as for the native variants. Another extra define is added for the packing required on x86. Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-4-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- include/linux/compat.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..a0481fe6c5d5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -258,6 +258,37 @@ struct compat_rlimit { compat_ulong_t rlim_max; }; +#ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED +#define __ARCH_COMPAT_FLOCK64_PACK __attribute__((packed)) +#else +#define __ARCH_COMPAT_FLOCK64_PACK +#endif + +struct compat_flock { + short l_type; + short l_whence; + compat_off_t l_start; + compat_off_t l_len; +#ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID + __ARCH_COMPAT_FLOCK_EXTRA_SYSID +#endif + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK_PAD + __ARCH_COMPAT_FLOCK_PAD +#endif +}; + +struct compat_flock64 { + short l_type; + short l_whence; + compat_loff_t l_start; + compat_loff_t l_len; + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK64_PAD + __ARCH_COMPAT_FLOCK64_PAD +#endif +} __ARCH_COMPAT_FLOCK64_PACK; + struct compat_rusage { struct old_timeval32 ru_utime; struct old_timeval32 ru_stime; -- cgit v1.2.3 From 59c10c52f573faca862cda5ebcdd43831608eb5a Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:13:05 +0800 Subject: riscv: compat: syscall: Add compat_sys_call_table implementation Implement compat sys_call_table and some system call functions: truncate64, ftruncate64, fallocate, pread64, pwrite64, sync_file_range, readahead, fadvise64_64 which need argument translation. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220405071314.3225832-12-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- include/linux/compat.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index a0481fe6c5d5..ce6cd9b30d27 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -926,6 +926,43 @@ asmlinkage long compat_sys_sigaction(int sig, /* obsolete: net/socket.c */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); +#ifdef __ARCH_WANT_COMPAT_TRUNCATE64 +asmlinkage long compat_sys_truncate64(const char __user *pathname, compat_arg_u64(len)); +#endif + +#ifdef __ARCH_WANT_COMPAT_FTRUNCATE64 +asmlinkage long compat_sys_ftruncate64(unsigned int fd, compat_arg_u64(len)); +#endif + +#ifdef __ARCH_WANT_COMPAT_FALLOCATE +asmlinkage long compat_sys_fallocate(int fd, int mode, compat_arg_u64(offset), + compat_arg_u64(len)); +#endif + +#ifdef __ARCH_WANT_COMPAT_PREAD64 +asmlinkage long compat_sys_pread64(unsigned int fd, char __user *buf, size_t count, + compat_arg_u64(pos)); +#endif + +#ifdef __ARCH_WANT_COMPAT_PWRITE64 +asmlinkage long compat_sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, + compat_arg_u64(pos)); +#endif + +#ifdef __ARCH_WANT_COMPAT_SYNC_FILE_RANGE +asmlinkage long compat_sys_sync_file_range(int fd, compat_arg_u64(pos), + compat_arg_u64(nbytes), unsigned int flags); +#endif + +#ifdef __ARCH_WANT_COMPAT_FADVISE64_64 +asmlinkage long compat_sys_fadvise64_64(int fd, compat_arg_u64(pos), + compat_arg_u64(len), int advice); +#endif + +#ifdef __ARCH_WANT_COMPAT_READAHEAD +asmlinkage long compat_sys_readahead(int fd, compat_arg_u64(offset), size_t count); +#endif + #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ /** -- cgit v1.2.3 From a2a9d67a26ec94a99ed29efbd61cf5be0a575678 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 6 Apr 2022 11:31:19 +0900 Subject: bootconfig: Support embedding a bootconfig file in kernel This allows kernel developer to embed a default bootconfig file in the kernel instead of embedding it in the initrd. This will be good for who are using the kernel without initrd, or who needs a default bootconfigs. This needs to set two kconfigs: CONFIG_BOOT_CONFIG_EMBED=y and set the file path to CONFIG_BOOT_CONFIG_EMBED_FILE. Note that you still need 'bootconfig' command line option to load the embedded bootconfig. Also if you boot using an initrd with a different bootconfig, the kernel will use the bootconfig in the initrd, instead of the default bootconfig. Link: https://lkml.kernel.org/r/164921227943.1090670.14035119557571329218.stgit@devnote2 Cc: Padmanabha Srinivasaiah Cc: Jonathan Corbet Cc: Randy Dunlap Cc: Nick Desaulniers Cc: Sami Tolvanen Cc: Nathan Chancellor Cc: Masahiro Yamada Cc: Linux Kbuild mailing list Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) --- include/linux/bootconfig.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index a4665c7ab07c..1611f9db878e 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -289,4 +289,14 @@ int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ void __init xbc_exit(void); +/* XBC embedded bootconfig data in kernel */ +#ifdef CONFIG_BOOT_CONFIG_EMBED +const char * __init xbc_get_embedded_bootconfig(size_t *size); +#else +static inline const char *xbc_get_embedded_bootconfig(size_t *size) +{ + return NULL; +} +#endif + #endif -- cgit v1.2.3 From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skbs frees outside of critical section where socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after skb payload has been consumed, meaning that BH handler has no chance to pick the skb before recvmsg() thread. This issue is more visible with BIG TCP, as more RPC fit one skb. For RFS, even if BH handler picks the skbs, they are still picked from the cpu on which user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_action_rx(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run net_action_rx() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead() This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ include/linux/skbuff.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 84d78df60453..5cbc184ca685 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t; * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS + * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available @@ -1080,6 +1081,7 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif @@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); -- cgit v1.2.3 From 6423d2951087231706246f81851067f7f0593d4a Mon Sep 17 00:00:00 2001 From: Won Chung Date: Mon, 14 Mar 2022 19:54:58 +0000 Subject: driver core: Add sysfs support for physical location of a device When ACPI table includes _PLD fields for a device, create a new directory (physical_location) in sysfs to share _PLD fields. Currently without PLD information, when there are multiple of same devices, it is hard to distinguish which device corresponds to which physical device at which location. For example, when there are two Type C connectors, it is hard to find out which connector corresponds to the Type C port on the left panel versus the Type C port on the right panel. With PLD information provided, we can determine which specific device at which location is doing what. _PLD output includes much more fields, but only generic fields are added and exposed to sysfs, so that non-ACPI devices can also support it in the future. The minimal generic fields needed for locating a device are the following. - panel - vertical_position - horizontal_position - dock - lid Signed-off-by: Won Chung Link: https://lore.kernel.org/r/20220314195458.271430-1-wonchung@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 93459724dcde..766fbea6ca83 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -386,6 +386,75 @@ struct dev_msi_info { #endif }; +/** + * enum device_physical_location_panel - Describes which panel surface of the + * system's housing the device connection point resides on. + * @DEVICE_PANEL_TOP: Device connection point is on the top panel. + * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel. + * @DEVICE_PANEL_LEFT: Device connection point is on the left panel. + * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel. + * @DEVICE_PANEL_FRONT: Device connection point is on the front panel. + * @DEVICE_PANEL_BACK: Device connection point is on the back panel. + * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown. + */ +enum device_physical_location_panel { + DEVICE_PANEL_TOP, + DEVICE_PANEL_BOTTOM, + DEVICE_PANEL_LEFT, + DEVICE_PANEL_RIGHT, + DEVICE_PANEL_FRONT, + DEVICE_PANEL_BACK, + DEVICE_PANEL_UNKNOWN, +}; + +/** + * enum device_physical_location_vertical_position - Describes vertical + * position of the device connection point on the panel surface. + * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel. + * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel. + * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel. + */ +enum device_physical_location_vertical_position { + DEVICE_VERT_POS_UPPER, + DEVICE_VERT_POS_CENTER, + DEVICE_VERT_POS_LOWER, +}; + +/** + * enum device_physical_location_horizontal_position - Describes horizontal + * position of the device connection point on the panel surface. + * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel. + * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel. + * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel. + */ +enum device_physical_location_horizontal_position { + DEVICE_HORI_POS_LEFT, + DEVICE_HORI_POS_CENTER, + DEVICE_HORI_POS_RIGHT, +}; + +/** + * struct device_physical_location - Device data related to physical location + * of the device connection point. + * @panel: Panel surface of the system's housing that the device connection + * point resides on. + * @vertical_position: Vertical position of the device connection point within + * the panel. + * @horizontal_position: Horizontal position of the device connection point + * within the panel. + * @dock: Set if the device connection point resides in a docking station or + * port replicator. + * @lid: Set if this device connection point resides on the lid of laptop + * system. + */ +struct device_physical_location { + enum device_physical_location_panel panel; + enum device_physical_location_vertical_position vertical_position; + enum device_physical_location_horizontal_position horizontal_position; + bool dock; + bool lid; +}; + /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. @@ -453,6 +522,8 @@ struct dev_msi_info { * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data + * @physical_location: Describes physical location of the device connection + * point in the system housing. * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. @@ -567,6 +638,8 @@ struct device { struct iommu_group *iommu_group; struct dev_iommu *iommu; + struct device_physical_location *physical_location; + enum device_removable removable; bool offline_disabled:1; -- cgit v1.2.3 From b041b525dab95352fbd666b14dc73ab898df465f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 Mar 2022 12:48:53 -0800 Subject: x86/split_lock: Make life miserable for split lockers In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas said: Its's simply wishful thinking that stuff gets fixed because of a WARN_ONCE(). This has never worked. The only thing which works is to make stuff fail hard or slow it down in a way which makes it annoying enough to users to complain. He was talking about WBINVD. But it made me think about how we use the split lock detection feature in Linux. Existing code has three options for applications: 1) Don't enable split lock detection (allow arbitrary split locks) 2) Warn once when a process uses split lock, but let the process keep running with split lock detection disabled 3) Kill process that use split locks Option 2 falls into the "wishful thinking" territory that Thomas warns does nothing. But option 3 might not be viable in a situation with legacy applications that need to run. Hence make option 2 much stricter to "slow it down in a way which makes it annoying". Primary reason for this change is to provide better quality of service to the rest of the applications running on the system. Internal testing shows that even with many processes splitting locks, performance for the rest of the system is much more responsive. The new "warn" mode operates like this. When an application tries to execute a bus lock the #AC handler. 1) Delays (interruptibly) 10 ms before moving to next step. 2) Blocks (interruptibly) until it can get the semaphore If interrupted, just return. Assume the signal will either kill the task, or direct execution away from the instruction that is trying to get the bus lock. 3) Disables split lock detection for the current core 4) Schedules a work queue to re-enable split lock detect in 2 jiffies 5) Returns The work queue that re-enables split lock detection also releases the semaphore. There is a corner case where a CPU may be taken offline while split lock detection is disabled. A CPU hotplug handler handles this case. Old behaviour was to only print the split lock warning on the first occurrence of a split lock from a task. Preserve that by adding a flag to the task structure that suppresses subsequent split lock messages from that task. Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220310204854.31752-2-tony.luck@intel.com --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a8911b1f35aa..23e03c7824c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -941,6 +941,9 @@ struct task_struct { #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; #endif +#ifdef CONFIG_CPU_SUP_INTEL + unsigned reported_split_lock:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ -- cgit v1.2.3 From 4fd62f15afa0d0da4823f429a2fb4c3492a84edf Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Sun, 24 Apr 2022 11:25:23 +0800 Subject: mtd: nand: make mtk_ecc.c a separated module this code will be used in mediatek snfi spi-mem controller with pipelined ECC engine. Signed-off-by: Chuanhong Guo Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220424032527.673605-2-gch981213@gmail.com --- include/linux/mtd/nand-ecc-mtk.h | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 include/linux/mtd/nand-ecc-mtk.h (limited to 'include/linux') diff --git a/include/linux/mtd/nand-ecc-mtk.h b/include/linux/mtd/nand-ecc-mtk.h new file mode 100644 index 000000000000..0e48c36e6ca0 --- /dev/null +++ b/include/linux/mtd/nand-ecc-mtk.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * MTK SDG1 ECC controller + * + * Copyright (c) 2016 Mediatek + * Authors: Xiaolei Li + * Jorge Ramirez-Ortiz + */ + +#ifndef __DRIVERS_MTD_NAND_MTK_ECC_H__ +#define __DRIVERS_MTD_NAND_MTK_ECC_H__ + +#include + +enum mtk_ecc_mode {ECC_DMA_MODE = 0, ECC_NFI_MODE = 1}; +enum mtk_ecc_operation {ECC_ENCODE, ECC_DECODE}; + +struct device_node; +struct mtk_ecc; + +struct mtk_ecc_stats { + u32 corrected; + u32 bitflips; + u32 failed; +}; + +struct mtk_ecc_config { + enum mtk_ecc_operation op; + enum mtk_ecc_mode mode; + dma_addr_t addr; + u32 strength; + u32 sectors; + u32 len; +}; + +int mtk_ecc_encode(struct mtk_ecc *, struct mtk_ecc_config *, u8 *, u32); +void mtk_ecc_get_stats(struct mtk_ecc *, struct mtk_ecc_stats *, int); +int mtk_ecc_wait_done(struct mtk_ecc *, enum mtk_ecc_operation); +int mtk_ecc_enable(struct mtk_ecc *, struct mtk_ecc_config *); +void mtk_ecc_disable(struct mtk_ecc *); +void mtk_ecc_adjust_strength(struct mtk_ecc *ecc, u32 *p); +unsigned int mtk_ecc_get_parity_bits(struct mtk_ecc *ecc); + +struct mtk_ecc *of_mtk_ecc_get(struct device_node *); +void mtk_ecc_release(struct mtk_ecc *); + +#endif -- cgit v1.2.3 From 7d84c1ebf9ddafca27b481e6da7d24a023dacaa2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:20:02 +0200 Subject: x86/aperfmperf: Replace aperfmperf_get_khz() The frequency invariance infrastructure provides the APERF/MPERF samples already. Utilize them for the cpu frequency display in /proc/cpuinfo. The sample is considered valid for 20ms. So for idle or isolated NOHZ full CPUs the function returns 0, which is matching the previous behaviour. This gets rid of the mass IPIs and a delay of 20ms for stabilizing observed by Eric when reading /proc/cpuinfo. Reported-by: Eric Dumazet Signed-off-by: Thomas Gleixner Tested-by: Eric Dumazet Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.875029458@linutronix.de --- include/linux/cpufreq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 35c7d6db4139..d5595d57f4e5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1199,7 +1199,6 @@ static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy, struct cpufreq_governor *old_gov) { } #endif -extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); #ifndef arch_set_freq_scale -- cgit v1.2.3 From b941820ec938d175fd6e4e7ba5b4a6ca9d092afd Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 25 Apr 2022 14:45:44 +0300 Subject: ACPI: OSL: Remove the helper for deactivating memory region There are no more users for acpi_release_memory(). Signed-off-by: Heikki Krogerus Reviewed-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d7136d13aa44..fadda404bcc9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -520,9 +520,6 @@ int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); -acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, - u32 level); - int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION -- cgit v1.2.3 From 0a8e98305f63deaf0a799d5cf5532cc83af035d1 Mon Sep 17 00:00:00 2001 From: Tokunori Ikegami Date: Thu, 24 Mar 2022 02:04:56 +0900 Subject: mtd: cfi_cmdset_0002: Use chip_ready() for write on S29GL064N Since commit dfeae1073583("mtd: cfi_cmdset_0002: Change write buffer to check correct value") buffered writes fail on S29GL064N. This is because, on S29GL064N, reads return 0xFF at the end of DQ polling for write completion, where as, chip_good() check expects actual data written to the last location to be returned post DQ polling completion. Fix is to revert to using chip_good() for S29GL064N which only checks for DQ lines to settle down to determine write completion. Link: https://lore.kernel.org/r/b687c259-6413-26c9-d4c9-b3afa69ea124@pengutronix.de/ Fixes: dfeae1073583("mtd: cfi_cmdset_0002: Change write buffer to check correct value") Cc: stable@vger.kernel.org Signed-off-by: Tokunori Ikegami Acked-by: Vignesh Raghavendra Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220323170458.5608-3-ikegami.t@gmail.com --- include/linux/mtd/cfi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index fd1ecb821106..d88bb56c18e2 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -286,6 +286,7 @@ struct cfi_private { map_word sector_erase_cmd; unsigned long chipshift; /* Because they're of the same type */ const char *im_name; /* inter_module name for cmdset_setup */ + unsigned long quirks; struct flchip chips[]; /* per-chip data structure for each chip */ }; -- cgit v1.2.3 From ed36d04e8f8d7b00db451b0fa56a54e8e02ec43e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:02 +0100 Subject: iommu: Introduce device_iommu_capable() iommu_capable() only really works for systems where all IOMMU instances are completely homogeneous, and all devices are IOMMU-mapped. Implement the new variant which will be able to give a more accurate answer for whichever device the caller is actually interested in, and even more so once all the external users have been converted and we can reliably pass the device pointer through the internal driver interface too. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/8407eb9586677995b7a9fd70d0fd82d85929a9bb.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9208eca4b0d1..e26cf84e5d82 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -417,6 +417,7 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); +extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); extern struct iommu_group *iommu_group_get_by_id(int id); @@ -689,6 +690,11 @@ static inline bool iommu_present(struct bus_type *bus) return false; } +static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + return false; +} + static inline bool iommu_capable(struct bus_type *bus, enum iommu_cap cap) { return false; -- cgit v1.2.3 From d0be55fbeb6ac694d15af5d1aad19cdec8cd64e5 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:03 +0100 Subject: iommu: Add capability for pre-boot DMA protection VT-d's dmar_platform_optin() actually represents a combination of properties fairly well standardised by Microsoft as "Pre-boot DMA Protection" and "Kernel DMA Protection"[1]. As such, we can provide interested consumers with an abstracted capability rather than driver-specific interfaces that won't scale. We name it for the former aspect since that's what external callers are most likely to be interested in; the latter is for the IOMMU layer to handle itself. [1] https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/oem-kernel-dma-protection Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d6218dff2702472da80db6aec2c9589010684551.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e26cf84e5d82..4123693ae319 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -107,6 +107,8 @@ enum iommu_cap { transactions */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ + IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for + DMA protection and we should too */ }; /* These are the possible reserved region types */ -- cgit v1.2.3 From 86eaf4a5b4312bea8676fb79399d9e08b53d8e71 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 25 Apr 2022 13:42:04 +0100 Subject: thunderbolt: Make iommu_dma_protection more accurate Between me trying to get rid of iommu_present() and Mario wanting to support the AMD equivalent of DMAR_PLATFORM_OPT_IN, scrutiny has shown that the iommu_dma_protection attribute is being far too optimistic. Even if an IOMMU might be present for some PCI segment in the system, that doesn't necessarily mean it provides translation for the device(s) we care about. Furthermore, all that DMAR_PLATFORM_OPT_IN really does is tell us that memory was protected before the kernel was loaded, and prevent the user from disabling the intel-iommu driver entirely. While that lets us assume kernel integrity, what matters for actual runtime DMA protection is whether we trust individual devices, based on the "external facing" property that we expect firmware to describe for Thunderbolt ports. It's proven challenging to determine the appropriate ports accurately given the variety of possible topologies, so while still not getting a perfect answer, by putting enough faith in firmware we can at least get a good bit closer. If we can see that any device near a Thunderbolt NHI has all the requisites for Kernel DMA Protection, chances are that it *is* a relevant port, but moreover that implies that firmware is playing the game overall, so we'll use that to assume that all Thunderbolt ports should be correctly marked and thus will end up fully protected. CC: Mario Limonciello Reviewed-by: Christoph Hellwig Acked-by: Mika Westerberg Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b153f208bc9eafab5105bad0358b77366509d2d4.1650878781.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/thunderbolt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 124e13cb1469..7a8ad984e651 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -465,6 +465,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @msix_ida: Used to allocate MSI-X vectors for rings * @going_away: The host controller device is about to disappear so when * this flag is set, avoid touching the hardware anymore. + * @iommu_dma_protection: An IOMMU will isolate external-facing ports. * @interrupt_work: Work scheduled to handle ring interrupt when no * MSI-X is used. * @hop_count: Number of rings (end point hops) supported by NHI. @@ -479,6 +480,7 @@ struct tb_nhi { struct tb_ring **rx_rings; struct ida msix_ida; bool going_away; + bool iommu_dma_protection; struct work_struct interrupt_work; u32 hop_count; unsigned long quirks; -- cgit v1.2.3 From 2147c438fde135d6c145a96e373d9348e7076f7f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 25 Apr 2022 16:40:02 -0700 Subject: x86/speculation: Add missing prototype for unpriv_ebpf_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings seen with "make W=1": kernel/sysctl.c:183:13: warning: no previous prototype for ‘unpriv_ebpf_notify’ [-Wmissing-prototypes] 183 | void __weak unpriv_ebpf_notify(int new_state) | ^~~~~~~~~~~~~~~~~~ arch/x86/kernel/cpu/bugs.c:659:6: warning: no previous prototype for ‘unpriv_ebpf_notify’ [-Wmissing-prototypes] 659 | void unpriv_ebpf_notify(int new_state) | ^~~~~~~~~~~~~~~~~~ Fixes: 44a3918c8245 ("x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/5689d065f739602ececaee1e05e68b8644009608.1650930000.git.jpoimboe@redhat.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb5298735ce..ecc3d3ec41cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2085,6 +2085,8 @@ void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); +void unpriv_ebpf_notify(int new_state); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -- cgit v1.2.3 From 1ea2a07a532b0e22aabe7e8483f935c672b9e7ed Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:50 +0800 Subject: iommu: Add DMA ownership management interfaces Multiple devices may be placed in the same IOMMU group because they cannot be isolated from each other. These devices must either be entirely under kernel control or userspace control, never a mixture. This adds dma ownership management in iommu core and exposes several interfaces for the device drivers and the device userspace assignment framework (i.e. VFIO), so that any conflict between user and kernel controlled dma could be detected at the beginning. The device driver oriented interfaces are, int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); By calling iommu_device_use_default_domain(), the device driver tells the iommu layer that the device dma is handled through the kernel DMA APIs. The iommu layer will manage the IOVA and use the default domain for DMA address translation. The device user-space assignment framework oriented interfaces are, int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); The device userspace assignment must be disallowed if the DMA owner claiming interface returns failure. Signed-off-by: Jason Gunthorpe Signed-off-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20220418005000.897664-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9208eca4b0d1..77972ef978b5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -675,6 +675,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +int iommu_device_use_default_domain(struct device *dev); +void iommu_device_unuse_default_domain(struct device *dev); + +int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); +void iommu_group_release_dma_owner(struct iommu_group *group); +bool iommu_group_dma_owner_claimed(struct iommu_group *group); + #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; @@ -1031,6 +1038,30 @@ static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; } + +static inline int iommu_device_use_default_domain(struct device *dev) +{ + return 0; +} + +static inline void iommu_device_unuse_default_domain(struct device *dev) +{ +} + +static inline int +iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) +{ + return -ENODEV; +} + +static inline void iommu_group_release_dma_owner(struct iommu_group *group) +{ +} + +static inline bool iommu_group_dma_owner_claimed(struct iommu_group *group) +{ + return false; +} #endif /* CONFIG_IOMMU_API */ /** -- cgit v1.2.3 From 25f3bcfc54bcf7b0e45d140ec8bfbbf10ba11869 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:51 +0800 Subject: driver core: Add dma_cleanup callback in bus_type The bus_type structure defines dma_configure() callback for bus drivers to configure DMA on the devices. This adds the paired dma_cleanup() callback and calls it during driver unbinding so that bus drivers can do some cleanup work. One use case for this paired DMA callbacks is for the bus driver to check for DMA ownership conflicts during driver binding, where multiple devices belonging to a same IOMMU group (the minimum granularity of isolation and protection) may be assigned to kernel drivers or user space respectively. Without this change, for example, the vfio driver has to listen to a bus BOUND_DRIVER event and then BUG_ON() in case of dma ownership conflict. This leads to bad user experience since careless driver binding operation may crash the system if the admin overlooks the group restriction. Aside from bad design, this leads to a security problem as a root user, even with lockdown=integrity, can force the kernel to BUG. With this change, the bus driver could check and set the DMA ownership in driver binding process and fail on ownership conflicts. The DMA ownership should be released during driver unbinding. Signed-off-by: Lu Baolu Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/device/bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index a039ab809753..d8b29ccd07e5 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -59,6 +59,8 @@ struct fwnode_handle; * bus supports. * @dma_configure: Called to setup DMA configuration on a device on * this bus. + * @dma_cleanup: Called to cleanup DMA configuration on a device on + * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU @@ -103,6 +105,7 @@ struct bus_type { int (*num_vf)(struct device *dev); int (*dma_configure)(struct device *dev); + void (*dma_cleanup)(struct device *dev); const struct dev_pm_ops *pm; -- cgit v1.2.3 From 4a6d9dd564d0e7339fc15ecc5ce66db4ad842be2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:52 +0800 Subject: amba: Stop sharing platform_dma_configure() Stop sharing platform_dma_configure() helper as they are about to have their own bus dma_configure callbacks. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/platform_device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7c96f169d274..17fde717df68 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -328,8 +328,6 @@ extern int platform_pm_restore(struct device *dev); #define platform_pm_restore NULL #endif -extern int platform_dma_configure(struct device *dev); - #ifdef CONFIG_PM_SLEEP #define USE_PLATFORM_PM_SLEEP_OPS \ .suspend = platform_pm_suspend, \ -- cgit v1.2.3 From 512881eacfa72c2136b27b9934b7b27504a9efc2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:49:53 +0800 Subject: bus: platform,amba,fsl-mc,PCI: Add device DMA ownership management The devices on platform/amba/fsl-mc/PCI buses could be bound to drivers with the device DMA managed by kernel drivers or user-space applications. Unfortunately, multiple devices may be placed in the same IOMMU group because they cannot be isolated from each other. The DMA on these devices must either be entirely under kernel control or userspace control, never a mixture. Otherwise the driver integrity is not guaranteed because they could access each other through the peer-to-peer accesses which by-pass the IOMMU protection. This checks and sets the default DMA mode during driver binding, and cleanups during driver unbinding. In the default mode, the device DMA is managed by the device driver which handles DMA operations through the kernel DMA APIs (see Documentation/core-api/dma-api.rst). For cases where the devices are assigned for userspace control through the userspace driver framework(i.e. VFIO), the drivers(for example, vfio_pci/ vfio_platfrom etc.) may set a new flag (driver_managed_dma) to skip this default setting in the assumption that the drivers know what they are doing with the device DMA. Calling iommu_device_use_default_domain() before {of,acpi}_dma_configure is currently a problem. As things stand, the IOMMU driver ignored the initial iommu_probe_device() call when the device was added, since at that point it had no fwspec yet. In this situation, {of,acpi}_iommu_configure() are retriggering iommu_probe_device() after the IOMMU driver has seen the firmware data via .of_xlate to learn that it actually responsible for the given device. As the result, before that gets fixed, iommu_use_default_domain() goes at the end, and calls arch_teardown_dma_ops() if it fails. Cc: Greg Kroah-Hartman Cc: Bjorn Helgaas Cc: Stuart Yoder Cc: Laurentiu Tudor Signed-off-by: Lu Baolu Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Reviewed-by: Robin Murphy Tested-by: Eric Auger Link: https://lore.kernel.org/r/20220418005000.897664-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/amba/bus.h | 8 ++++++++ include/linux/fsl/mc.h | 8 ++++++++ include/linux/pci.h | 8 ++++++++ include/linux/platform_device.h | 8 ++++++++ 4 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 6562f543c3e0..2ddce9bcd00e 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -79,6 +79,14 @@ struct amba_driver { void (*remove)(struct amba_device *); void (*shutdown)(struct amba_device *); const struct amba_id *id_table; + /* + * For most device drivers, no need to care about this flag as long as + * all DMAs are handled through the kernel DMA API. For some special + * ones, for example VFIO drivers, they know how to manage the DMA + * themselves and set this flag so that the IOMMU layer will allow them + * to setup and manage their own I/O address space. + */ + bool driver_managed_dma; }; /* diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 7b6c42bfb660..27efef8affb1 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -32,6 +32,13 @@ struct fsl_mc_io; * @shutdown: Function called at shutdown time to quiesce the device * @suspend: Function called when a device is stopped * @resume: Function called when a device is resumed + * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA. + * For most device drivers, no need to care about this flag + * as long as all DMAs are handled through the kernel DMA API. + * For some special ones, for example VFIO drivers, they know + * how to manage the DMA themselves and set this flag so that + * the IOMMU layer will allow them to setup and manage their + * own I/O address space. * * Generic DPAA device driver object for device drivers that are registered * with a DPRC bus. This structure is to be embedded in each device-specific @@ -45,6 +52,7 @@ struct fsl_mc_driver { void (*shutdown)(struct fsl_mc_device *dev); int (*suspend)(struct fsl_mc_device *dev, pm_message_t state); int (*resume)(struct fsl_mc_device *dev); + bool driver_managed_dma; }; #define to_fsl_mc_driver(_drv) \ diff --git a/include/linux/pci.h b/include/linux/pci.h index 60adf42460ab..b933d2b08d4d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -895,6 +895,13 @@ struct module; * created once it is bound to the driver. * @driver: Driver model structure. * @dynids: List of dynamically added device IDs. + * @driver_managed_dma: Device driver doesn't use kernel DMA API for DMA. + * For most device drivers, no need to care about this flag + * as long as all DMAs are handled through the kernel DMA API. + * For some special ones, for example VFIO drivers, they know + * how to manage the DMA themselves and set this flag so that + * the IOMMU layer will allow them to setup and manage their + * own I/O address space. */ struct pci_driver { struct list_head node; @@ -913,6 +920,7 @@ struct pci_driver { const struct attribute_group **dev_groups; struct device_driver driver; struct pci_dynids dynids; + bool driver_managed_dma; }; static inline struct pci_driver *to_pci_driver(struct device_driver *drv) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 17fde717df68..b3d9c744f1e5 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -210,6 +210,14 @@ struct platform_driver { struct device_driver driver; const struct platform_device_id *id_table; bool prevent_deferred_probe; + /* + * For most device drivers, no need to care about this flag as long as + * all DMAs are handled through the kernel DMA API. For some special + * ones, for example VFIO drivers, they know how to manage the DMA + * themselves and set this flag so that the IOMMU layer will allow them + * to setup and manage their own I/O address space. + */ + bool driver_managed_dma; }; #define to_platform_driver(drv) (container_of((drv), struct platform_driver, \ -- cgit v1.2.3 From a5f1bd1afacd7b1e088f93f66af5453df0d8be9a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Apr 2022 08:50:00 +0800 Subject: iommu: Remove iommu group changes notifier The iommu group changes notifer is not referenced in the tree. Remove it to avoid dead code. Suggested-by: Christoph Hellwig Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220418005000.897664-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 77972ef978b5..6ef2df258673 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -407,13 +407,6 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } -#define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ -#define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ -#define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ -#define IOMMU_GROUP_NOTIFY_BOUND_DRIVER 4 /* Post Driver bind */ -#define IOMMU_GROUP_NOTIFY_UNBIND_DRIVER 5 /* Pre Driver unbind */ -#define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ - extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); @@ -478,10 +471,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data, extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb); -extern int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb); extern int iommu_register_device_fault_handler(struct device *dev, iommu_dev_fault_handler_t handler, void *data); @@ -878,18 +867,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline int iommu_group_register_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return -ENODEV; -} - -static inline int iommu_group_unregister_notifier(struct iommu_group *group, - struct notifier_block *nb) -{ - return 0; -} - static inline int iommu_register_device_fault_handler(struct device *dev, iommu_dev_fault_handler_t handler, -- cgit v1.2.3 From 00675017e0aeba5305665c52ded4ddce6a4c0231 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 4 Apr 2022 12:51:40 +0200 Subject: fs: add two trivial lookup helpers Similar to the addition of lookup_one() add a version of lookup_one_unlocked() and lookup_one_positive_unlocked() that take idmapped mounts into account. This is required to port overlay to support idmapped base layers. Cc: Tested-by: Giuseppe Scrivano Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- include/linux/namei.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index e89329bb3134..caeb08a98536 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -69,6 +69,12 @@ extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); struct dentry *lookup_one(struct user_namespace *, const char *, struct dentry *, int); +struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, + const char *name, struct dentry *base, + int len); +struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, + const char *name, + struct dentry *base, int len); extern int follow_down_one(struct path *); extern int follow_down(struct path *); -- cgit v1.2.3 From dbde6d0c7a5a462a1767a07c28eadd2c3dd08fb2 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:05 +0200 Subject: hv_sock: Add validation for untrusted Hyper-V values For additional robustness in the face of Hyper-V errors or malicious behavior, validate all values that originate from packets that Hyper-V has sent to the guest in the host-to-guest ring buffer. Ensure that invalid values cannot cause data being copied out of the bounds of the source buffer in hvs_stream_dequeue(). Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20220428145107.7878-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 460a716f4748..a01c9fd0a334 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1696,6 +1696,11 @@ static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) return (desc->len8 << 3) - (desc->offset8 << 3); } +/* Get packet length associated with descriptor */ +static inline u32 hv_pkt_len(const struct vmpacket_descriptor *desc) +{ + return desc->len8 << 3; +} struct vmpacket_descriptor * hv_pkt_iter_first_raw(struct vmbus_channel *channel); -- cgit v1.2.3 From da795eb239d9e9496812e22cb582d38a71e0789a Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:06 +0200 Subject: Drivers: hv: vmbus: Accept hv_sock offers in isolated guests So that isolated guests can communicate with the host via hv_sock channels. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220428145107.7878-5-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a01c9fd0a334..b028905d8334 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1064,10 +1064,14 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); +} + static inline bool is_hvsock_channel(const struct vmbus_channel *c) { - return !!(c->offermsg.offer.chn_flags & - VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); + return is_hvsock_offer(&c->offermsg); } static inline bool is_sub_channel(const struct vmbus_channel *c) -- cgit v1.2.3 From 1c9de08f7f952b4101f092802581344033d84429 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 28 Apr 2022 16:51:07 +0200 Subject: Drivers: hv: vmbus: Refactor the ring-buffer iterator functions With no users of hv_pkt_iter_next_raw() and no "external" users of hv_pkt_iter_first_raw(), the iterator functions can be refactored and simplified to remove some indirection/code. Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220428145107.7878-6-parri.andrea@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b028905d8334..c440c45887c2 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1706,55 +1706,28 @@ static inline u32 hv_pkt_len(const struct vmpacket_descriptor *desc) return desc->len8 << 3; } -struct vmpacket_descriptor * -hv_pkt_iter_first_raw(struct vmbus_channel *channel); - struct vmpacket_descriptor * hv_pkt_iter_first(struct vmbus_channel *channel); struct vmpacket_descriptor * __hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy); + const struct vmpacket_descriptor *pkt); void hv_pkt_iter_close(struct vmbus_channel *channel); static inline struct vmpacket_descriptor * -hv_pkt_iter_next_pkt(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt, - bool copy) +hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) { struct vmpacket_descriptor *nxt; - nxt = __hv_pkt_iter_next(channel, pkt, copy); + nxt = __hv_pkt_iter_next(channel, pkt); if (!nxt) hv_pkt_iter_close(channel); return nxt; } -/* - * Get next packet descriptor without copying it out of the ring buffer - * If at end of list, return NULL and update host. - */ -static inline struct vmpacket_descriptor * -hv_pkt_iter_next_raw(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) -{ - return hv_pkt_iter_next_pkt(channel, pkt, false); -} - -/* - * Get next packet descriptor from iterator - * If at end of list, return NULL and update host. - */ -static inline struct vmpacket_descriptor * -hv_pkt_iter_next(struct vmbus_channel *channel, - const struct vmpacket_descriptor *pkt) -{ - return hv_pkt_iter_next_pkt(channel, pkt, true); -} - #define foreach_vmbus_pkt(pkt, channel) \ for (pkt = hv_pkt_iter_first(channel); pkt; \ pkt = hv_pkt_iter_next(channel, pkt)) -- cgit v1.2.3 From 6043257b1de069fbb5a2a52d7211c0275bc8c0e0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:05 -0300 Subject: iommu: Introduce the domain op enforce_cache_coherency() This new mechanism will replace using IOMMU_CAP_CACHE_COHERENCY and IOMMU_CACHE to control the no-snoop blocking behavior of the IOMMU. Currently only Intel and AMD IOMMUs are known to support this feature. They both implement it as an IOPTE bit, that when set, will cause PCIe TLPs to that IOVA with the no-snoop bit set to be treated as though the no-snoop bit was clear. The new API is triggered by calling enforce_cache_coherency() before mapping any IOVA to the domain which globally switches on no-snoop blocking. This allows other implementations that might block no-snoop globally and outside the IOPTE - AMD also documents such a HW capability. Leave AMD out of sync with Intel and have it block no-snoop even for in-kernel users. This can be trivially resolved in a follow up patch. Only VFIO needs to call this API because it does not have detailed control over the device to avoid requesting no-snoop behavior at the device level. Other places using domains with real kernel drivers should simply avoid asking their devices to set the no-snoop bit. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 + include/linux/iommu.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2f9891cb3d00..4c2baf2446c2 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -540,6 +540,7 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 iommu_snooping: 1; /* indicate snooping control feature */ + u8 force_snooping : 1; /* Create IOPTEs with snoop control */ struct list_head devices; /* all devices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4123693ae319..c7ad6b10e261 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -274,6 +274,9 @@ struct iommu_ops { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address + * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, + * including no-snoop TLPs on PCIe or other platform + * specific mechanisms. * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. @@ -302,6 +305,7 @@ struct iommu_domain_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); + bool (*enforce_cache_coherency)(struct iommu_domain *domain); int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); -- cgit v1.2.3 From 71cfafda9c9bd9812cdb62ddb94daf65a1af12c1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:06 -0300 Subject: vfio: Move the Intel no-snoop control off of IOMMU_CACHE IOMMU_CACHE means "normal DMA to this iommu_domain's IOVA should be cache coherent" and is used by the DMA API. The definition allows for special non-coherent DMA to exist - ie processing of the no-snoop flag in PCIe TLPs - so long as this behavior is opt-in by the device driver. The flag is mainly used by the DMA API to synchronize the IOMMU setting with the expected cache behavior of the DMA master. eg based on dev_is_dma_coherent() in some case. For Intel IOMMU IOMMU_CACHE was redefined to mean 'force all DMA to be cache coherent' which has the practical effect of causing the IOMMU to ignore the no-snoop bit in a PCIe TLP. x86 platforms are always IOMMU_CACHE, so Intel should ignore this flag. Instead use the new domain op enforce_cache_coherency() which causes every IOPTE created in the domain to have the no-snoop blocking behavior. Reconfigure VFIO to always use IOMMU_CACHE and call enforce_cache_coherency() to operate the special Intel behavior. Remove the IOMMU_CACHE test from Intel IOMMU. Ultimately VFIO plumbs the result of enforce_cache_coherency() back into the x86 platform code through kvm_arch_register_noncoherent_dma() which controls if the WBINVD instruction is available in the guest. No other archs implement kvm_arch_register_noncoherent_dma() nor are there any other known consumers of VFIO_DMA_CC_IOMMU that might be affected by the user visible result change on non-x86 archs. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Acked-by: Alex Williamson Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4c2baf2446c2..72e5d7900e71 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -539,7 +539,6 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ - u8 iommu_snooping: 1; /* indicate snooping control feature */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ struct list_head devices; /* all devices' list */ -- cgit v1.2.3 From f78dc1dad829e505d83e33dc0879887f074c52e1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 11 Apr 2022 12:16:07 -0300 Subject: iommu: Redefine IOMMU_CAP_CACHE_COHERENCY as the cap flag for IOMMU_CACHE While the comment was correct that this flag was intended to convey the block no-snoop support in the IOMMU, it has become widely implemented and used to mean the IOMMU supports IOMMU_CACHE as a map flag. Only the Intel driver was different. Now that the Intel driver is using enforce_cache_coherency() update the comment to make it clear that IOMMU_CAP_CACHE_COHERENCY is only about IOMMU_CACHE. Fix the Intel driver to return true since IOMMU_CACHE always works. The two places that test this flag, usnic and vdpa, are both assigning userspace pages to a driver controlled iommu_domain and require IOMMU_CACHE behavior as they offer no way for userspace to synchronize caches. Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Acked-by: Robin Murphy Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-2cf356649677+a32-intel_no_snoop_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c7ad6b10e261..575ab27ede5b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -103,8 +103,7 @@ static inline bool iommu_is_dma_domain(struct iommu_domain *domain) } enum iommu_cap { - IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA - transactions */ + IOMMU_CAP_CACHE_COHERENCY, /* IOMMU_CACHE is supported */ IOMMU_CAP_INTR_REMAP, /* IOMMU supports interrupt isolation */ IOMMU_CAP_NOEXEC, /* IOMMU_NOEXEC flag */ IOMMU_CAP_PRE_BOOT_PROTECTION, /* Firmware says it used the IOMMU for -- cgit v1.2.3 From 992be5d3c818fcc277db246cb409659ca82abdbe Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 30 Mar 2022 16:05:35 +0100 Subject: firmware: arm_scmi: Make name_get operations return a const A few protocol operations are available that returns a pointer to an internal character array representing resource name. Make those functions return a const pointer to such array. Link: https://lore.kernel.org/r/20220330150551.2573938-7-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index b87551f41f9f..ced37d1de1fe 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -146,7 +146,8 @@ struct scmi_perf_proto_ops { */ struct scmi_power_proto_ops { int (*num_domains_get)(const struct scmi_protocol_handle *ph); - char *(*name_get)(const struct scmi_protocol_handle *ph, u32 domain); + const char *(*name_get)(const struct scmi_protocol_handle *ph, + u32 domain); #define SCMI_POWER_STATE_TYPE_SHIFT 30 #define SCMI_POWER_STATE_ID_MASK (BIT(28) - 1) #define SCMI_POWER_STATE_PARAM(type, id) \ @@ -484,7 +485,8 @@ struct scmi_sensor_proto_ops { */ struct scmi_reset_proto_ops { int (*num_domains_get)(const struct scmi_protocol_handle *ph); - char *(*name_get)(const struct scmi_protocol_handle *ph, u32 domain); + const char *(*name_get)(const struct scmi_protocol_handle *ph, + u32 domain); int (*latency_get)(const struct scmi_protocol_handle *ph, u32 domain); int (*reset)(const struct scmi_protocol_handle *ph, u32 domain); int (*assert)(const struct scmi_protocol_handle *ph, u32 domain); -- cgit v1.2.3 From b260fccaebdc2c838e62aaef24fedf497f181d10 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 30 Mar 2022 16:05:40 +0100 Subject: firmware: arm_scmi: Add SCMI v3.1 protocol extended names support Using the common protocol helper implementation add support for all new SCMIv3.1 extended names commands related to all protocols with the exception of SENSOR_AXIS_GET_NAME. Link: https://lore.kernel.org/r/20220330150551.2573938-12-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index ced37d1de1fe..56e6f13355b8 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -13,7 +13,7 @@ #include #include -#define SCMI_MAX_STR_SIZE 16 +#define SCMI_MAX_STR_SIZE 64 #define SCMI_MAX_NUM_RATES 16 /** -- cgit v1.2.3 From 7aa75496ea1f38dfd99b93c66f8d9bc525d11efc Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 30 Mar 2022 16:05:48 +0100 Subject: firmware: arm_scmi: Add SCMI v3.1 clock notifications Add SCMI v3.1 clock pre and post notifications. Link: https://lore.kernel.org/r/20220330150551.2573938-20-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 56e6f13355b8..0e20acc80d50 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -44,6 +44,8 @@ struct scmi_clock_info { char name[SCMI_MAX_STR_SIZE]; unsigned int enable_latency; bool rate_discrete; + bool rate_changed_notifications; + bool rate_change_requested_notifications; union { struct { int num_rates; @@ -744,6 +746,8 @@ void scmi_protocol_unregister(const struct scmi_protocol *proto); /* SCMI Notification API - Custom Event Reports */ enum scmi_notification_events { SCMI_EVENT_POWER_STATE_CHANGED = 0x0, + SCMI_EVENT_CLOCK_RATE_CHANGED = 0x0, + SCMI_EVENT_CLOCK_RATE_CHANGE_REQUESTED = 0x1, SCMI_EVENT_PERFORMANCE_LIMITS_CHANGED = 0x0, SCMI_EVENT_PERFORMANCE_LEVEL_CHANGED = 0x1, SCMI_EVENT_SENSOR_TRIP_POINT_EVENT = 0x0, @@ -760,6 +764,13 @@ struct scmi_power_state_changed_report { unsigned int power_state; }; +struct scmi_clock_rate_notif_report { + ktime_t timestamp; + unsigned int agent_id; + unsigned int clock_id; + unsigned long long rate; +}; + struct scmi_system_power_state_notifier_report { ktime_t timestamp; unsigned int agent_id; -- cgit v1.2.3 From 4c74701b1eb7636eb0cdd66b488b42920105122a Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 30 Mar 2022 16:05:49 +0100 Subject: firmware: arm_scmi: Add SCMI v3.1 VOLTAGE_LEVEL_SET_COMPLETE Add SCMI v3.1 voltage protocol support for asynchronous VOLTAGE_LEVEL_SET command. Note that, if a voltage domain is advertised to support the asynchronous version of VOLTAGE_LEVEL_SET, the command will be issued asynchronously unless explicitly requested to use the synchronous version by setting the mode to SCMI_VOLTAGE_LEVEL_SET_SYNC when calling voltage_ops->level_set. The SCMI regulator driver level_set invocation has been left unchanged so that it will transparently use the asynchronous version if available. Link: https://lore.kernel.org/r/20220330150551.2573938-21-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0e20acc80d50..1c58646ba381 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -495,6 +495,11 @@ struct scmi_reset_proto_ops { int (*deassert)(const struct scmi_protocol_handle *ph, u32 domain); }; +enum scmi_voltage_level_mode { + SCMI_VOLTAGE_LEVEL_SET_AUTO, + SCMI_VOLTAGE_LEVEL_SET_SYNC, +}; + /** * struct scmi_voltage_info - describe one available SCMI Voltage Domain * @@ -507,7 +512,8 @@ struct scmi_reset_proto_ops { * supported voltage level * @negative_volts_allowed: True if any of the entries of @levels_uv represent * a negative voltage. - * @attributes: represents Voltage Domain advertised attributes + * @async_level_set: True when the voltage domain supports asynchronous level + * set commands. * @name: name assigned to the Voltage Domain by platform * @num_levels: number of total entries in @levels_uv. * @levels_uv: array of entries describing the available voltage levels for @@ -517,7 +523,7 @@ struct scmi_voltage_info { unsigned int id; bool segmented; bool negative_volts_allowed; - unsigned int attributes; + bool async_level_set; char name[SCMI_MAX_STR_SIZE]; unsigned int num_levels; #define SCMI_VOLTAGE_SEGMENT_LOW 0 @@ -548,7 +554,7 @@ struct scmi_voltage_proto_ops { int (*config_get)(const struct scmi_protocol_handle *ph, u32 domain_id, u32 *config); int (*level_set)(const struct scmi_protocol_handle *ph, u32 domain_id, - u32 flags, s32 volt_uV); + enum scmi_voltage_level_mode mode, s32 volt_uV); int (*level_get)(const struct scmi_protocol_handle *ph, u32 domain_id, s32 *volt_uV); }; -- cgit v1.2.3 From ac3e62f51b3fbda78a44fff62ece8f11d8f08a25 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 7 Feb 2022 15:38:40 +0100 Subject: iio: core: Clarify the modes As part of a previous discussion with Jonathan Cameron [1], it appeared necessary to clarify the meaning of each mode so that new developers could understand better what they should use or not use and when. The idea of renaming these modes as been let aside because naming is a big deal and requires a lot of thinking. So for now let's focus on correctly explaining what each mode implies. [1] https://lore.kernel.org/linux-iio/20210930165510.2295e6c4@jic23-huawei/ Suggested-by: Jonathan Cameron Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220207143840.707510-14-miquel.raynal@bootlin.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 85cb924debd9..233d2e6b7721 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -315,7 +315,54 @@ static inline bool iio_channel_has_available(const struct iio_chan_spec *chan, s64 iio_get_time_ns(const struct iio_dev *indio_dev); unsigned int iio_get_time_res(const struct iio_dev *indio_dev); -/* Device operating modes */ +/* + * Device operating modes + * @INDIO_DIRECT_MODE: There is an access to either: + * a) The last single value available for devices that do not provide + * on-demand reads. + * b) A new value after performing an on-demand read otherwise. + * On most devices, this is a single-shot read. On some devices with data + * streams without an 'on-demand' function, this might also be the 'last value' + * feature. Above all, this mode internally means that we are not in any of the + * other modes, and sysfs reads should work. + * Device drivers should inform the core if they support this mode. + * @INDIO_BUFFER_TRIGGERED: Common mode when dealing with kfifo buffers. + * It indicates that an explicit trigger is required. This requests the core to + * attach a poll function when enabling the buffer, which is indicated by the + * _TRIGGERED suffix. + * The core will ensure this mode is set when registering a triggered buffer + * with iio_triggered_buffer_setup(). + * @INDIO_BUFFER_SOFTWARE: Another kfifo buffer mode, but not event triggered. + * No poll function can be attached because there is no triggered infrastructure + * we can use to cause capture. There is a kfifo that the driver will fill, but + * not "only one scan at a time". Typically, hardware will have a buffer that + * can hold multiple scans. Software may read one or more scans at a single time + * and push the available data to a Kfifo. This means the core will not attach + * any poll function when enabling the buffer. + * The core will ensure this mode is set when registering a simple kfifo buffer + * with devm_iio_kfifo_buffer_setup(). + * @INDIO_BUFFER_HARDWARE: For specific hardware, if unsure do not use this mode. + * Same as above but this time the buffer is not a kfifo where we have direct + * access to the data. Instead, the consumer driver must access the data through + * non software visible channels (or DMA when there is no demux possible in + * software) + * The core will ensure this mode is set when registering a dmaengine buffer + * with devm_iio_dmaengine_buffer_setup(). + * @INDIO_EVENT_TRIGGERED: Very unusual mode. + * Triggers usually refer to an external event which will start data capture. + * Here it is kind of the opposite as, a particular state of the data might + * produce an event which can be considered as an event. We don't necessarily + * have access to the data itself, but to the event produced. For example, this + * can be a threshold detector. The internal path of this mode is very close to + * the INDIO_BUFFER_TRIGGERED mode. + * The core will ensure this mode is set when registering a triggered event. + * @INDIO_HARDWARE_TRIGGERED: Very unusual mode. + * Here, triggers can result in data capture and can be routed to multiple + * hardware components, which make them close to regular triggers in the way + * they must be managed by the core, but without the entire interrupts/poll + * functions burden. Interrupts are irrelevant as the data flow is hardware + * mediated and distributed. + */ #define INDIO_DIRECT_MODE 0x01 #define INDIO_BUFFER_TRIGGERED 0x02 #define INDIO_BUFFER_SOFTWARE 0x04 -- cgit v1.2.3 From cc10eee95204579fcd66fd5965073fdcbf629676 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 13 Apr 2022 01:36:16 -0600 Subject: PCI/ACPI: add a helper for retrieving _OSC Control DWORDs During _OSC negotiation, when the 'Control' DWORD is needed from the result buffer after running _OSC, a couple of places performed manual pointer arithmetic to offset into the right spot in the raw buffer. Add a acpi_osc_ctx_get_pci_control() helper to use the #define'd DWORD offsets to fetch the DWORDs needed from @acpi_osc_context, and replace the above instances of the open-coded arithmetic. Cc: "Rafael J. Wysocki" Suggested-by: Davidlohr Bueso Acked-by: Rafael J. Wysocki Reviewed-by: Rafael J. Wysocki Reviewed-by: Davidlohr Bueso Reviewed by: Adam Manzanares Signed-off-by: Vishal Verma Link: https://lore.kernel.org/r/20220413073618.291335-2-vishal.l.verma@intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d7136d13aa44..04e5a038dd57 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -608,6 +608,13 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 +static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) +{ + u32 *ret = context->ret.pointer; + + return ret[OSC_CONTROL_DWORD]; +} + #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 @@ -1004,6 +1011,12 @@ static inline int acpi_register_wakeup_handler(int wake_irq, static inline void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context) { } +struct acpi_osc_context; +static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) +{ + return 0; +} + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From 241d26bc26add2e2867c546f7474902406d37c60 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 13 Apr 2022 01:36:17 -0600 Subject: PCI/ACPI: Prefer CXL _OSC instead of PCIe _OSC for CXL host bridges OB In preparation for negotiating OS control of CXL _OSC features, do the minimal enabling to use CXL _OSC to handle the base PCIe feature negotiation. Recall that CXL _OSC is a super-set of PCIe _OSC and the CXL 2.0 specification mandates: "If a CXL Host Bridge device exposes CXL _OSC, CXL aware OSPM shall evaluate CXL _OSC and not evaluate PCIe _OSC." Rather than pass a boolean flag alongside @root to all the helper functions that need to consider PCIe specifics, add is_pcie() and is_cxl() helper functions to check the flavor of @root. This also allows for dynamic fallback to PCIe _OSC in cases where an attempt to use CXL _OXC fails. This can happen on CXL 1.1 platforms that publish ACPI0016 devices to indicate CXL host bridges, but do not publish the optional CXL _OSC method. CXL _OSC is mandatory for CXL 2.0 hosts. Cc: Bjorn Helgaas Cc: "Rafael J. Wysocki" Cc: Robert Moore Reviewed-by: Jonathan Cameron Reviewed-by: Rafael J. Wysocki Reviewed-by: Davidlohr Bueso Signed-off-by: Vishal Verma Link: https://lore.kernel.org/r/20220413073618.291335-3-vishal.l.verma@intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 04e5a038dd57..82d91d9ccce5 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -550,6 +550,10 @@ struct acpi_osc_context { acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); +/* Number of _OSC capability DWORDS depends on bridge type */ +#define OSC_PCI_CAPABILITY_DWORDS 3 +#define OSC_CXL_CAPABILITY_DWORDS 5 + /* Indexes into _OSC Capabilities Buffer (DWORDs 2 & 3 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ -- cgit v1.2.3 From 56368029d93bbb3246ee2e03268fa6dd9754be05 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 13 Apr 2022 01:36:18 -0600 Subject: PCI/ACPI: negotiate CXL _OSC Add full support for negotiating _OSC as defined in the CXL 2.0 spec, as applicable to CXL-enabled platforms. Advertise support for the CXL features we support - 'CXL 2.0 port/device register access', 'Protocol Error Reporting', and 'CXL Native Hot Plug'. Request control for 'CXL Memory Error Reporting'. The requests are dependent on CONFIG_* based prerequisites, and prior PCI enabling, similar to how the standard PCI _OSC bits are determined. The CXL specification does not define any additional constraints on the hotplug flow beyond PCIe native hotplug, so a kernel that supports native PCIe hotplug, supports CXL hotplug. For error handling protocol and link errors just use PCIe AER. There is nascent support for amending AER events with CXL specific status [1], but there's otherwise no additional OS responsibility for CXL errors beyond PCIe AER. CXL Memory Errors behave the same as typical memory errors so CONFIG_MEMORY_FAILURE is sufficient to indicate support to platform firmware. [1]: https://lore.kernel.org/linux-cxl/164740402242.3912056.8303625392871313860.stgit@dwillia2-desk3.amr.corp.intel.com/ Cc: Bjorn Helgaas Cc: "Rafael J. Wysocki" Cc: Robert Moore Cc: Dan Williams Reviewed-by: Rafael J. Wysocki Reviewed-by: Davidlohr Bueso Signed-off-by: Vishal Verma Link: https://lore.kernel.org/r/20220413073618.291335-4-vishal.l.verma@intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 82d91d9ccce5..378a431666b3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -554,10 +554,12 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_PCI_CAPABILITY_DWORDS 3 #define OSC_CXL_CAPABILITY_DWORDS 5 -/* Indexes into _OSC Capabilities Buffer (DWORDs 2 & 3 are device-specific) */ +/* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ #define OSC_CONTROL_DWORD 2 /* DWORD 3 */ +#define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */ +#define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */ /* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */ #define OSC_QUERY_ENABLE 0x00000001 /* input */ @@ -612,6 +614,15 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 +/* CXL _OSC: Capabilities DWORD 4: Support Field */ +#define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001 +#define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002 +#define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004 +#define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008 + +/* CXL _OSC: Capabilities DWORD 5: Control Field */ +#define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001 + static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; @@ -619,6 +630,13 @@ static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) return ret[OSC_CONTROL_DWORD]; } +static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) +{ + u32 *ret = context->ret.pointer; + + return ret[OSC_EXT_CONTROL_DWORD]; +} + #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 @@ -1021,6 +1039,11 @@ static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) return 0; } +static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) +{ + return 0; +} + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From d864b8ea6468cf1dce614a58eec92a23d8e07fec Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 26 Apr 2022 12:22:44 -0700 Subject: cxl/acpi: Add root device lockdep validation The CXL "root" device, ACPI0017, is an attach point for coordinating platform level CXL resources and is the parent device for a CXL port topology tree. As such it has distinct locking rules relative to other CXL subsystem objects, but because it is an ACPI device the lock class is established well before it is given to the cxl_acpi driver. However, the lockdep API does support changing the lock class "live" for situations like this. Add a device_lock_set_class() helper that a driver can use in ->probe() to set a custom lock class, and device_lock_reset_class() to return to the default "no validate" class before the custom lock class key goes out of scope after ->remove(). Note the helpers are all macros to support dead code elimination in the CONFIG_PROVE_LOCKING=n case, however device_set_lock_class() still needs #ifdef CONFIG_PROVE_LOCKING since lockdep_match_class() explicitly does not have a helper in the CONFIG_PROVE_LOCKING=n case (see comment in lockdep.h). The lockdep API needs 2 small tweaks to prevent "unused" warnings for the @key argument to lock_set_class(), and a new lock_set_novalidate_class() is added to supplement lockdep_set_novalidate_class() in the cases where the lock class is converted while the lock is held. Suggested-by: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Cc: Alison Schofield Cc: Vishal Verma Cc: Ben Widawsky Cc: Jonathan Cameron Reviewed-by: Greg Kroah-Hartman Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/165100081305.1528964.11138612430659737238.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/device.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/lockdep.h | 6 +++++- 2 files changed, 48 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 93459724dcde..833b0b3b0193 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -850,6 +850,49 @@ static inline bool device_supports_offline(struct device *dev) return dev->bus && dev->bus->offline && dev->bus->online; } +#define __device_lock_set_class(dev, name, key) \ +do { \ + struct device *__d2 __maybe_unused = dev; \ + lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \ +} while (0) + +/** + * device_lock_set_class - Specify a temporary lock class while a device + * is attached to a driver + * @dev: device to modify + * @key: lock class key data + * + * This must be called with the device_lock() already held, for example + * from driver ->probe(). Take care to only override the default + * lockdep_no_validate class. + */ +#ifdef CONFIG_LOCKDEP +#define device_lock_set_class(dev, key) \ +do { \ + struct device *__d = dev; \ + dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex, \ + &__lockdep_no_validate__), \ + "overriding existing custom lock class\n"); \ + __device_lock_set_class(__d, #key, key); \ +} while (0) +#else +#define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key) +#endif + +/** + * device_lock_reset_class - Return a device to the default lockdep novalidate state + * @dev: device to modify + * + * This must be called with the device_lock() already held, for example + * from driver ->remove(). + */ +#define device_lock_reset_class(dev) \ +do { \ + struct device *__d __maybe_unused = dev; \ + lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex", \ + _THIS_IP_); \ +} while (0) + void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..43b0dc6a0b21 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -290,6 +290,9 @@ extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); +#define lock_set_novalidate_class(l, n, i) \ + lock_set_class(l, n, &__lockdep_no_validate__, 0, i) + static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { @@ -357,7 +360,8 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) -# define lock_set_class(l, n, k, s, i) do { } while (0) +# define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0) +# define lock_set_novalidate_class(l, n, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ -- cgit v1.2.3 From fd3abd2cafa46955846d731b9a6ded2c19ab73d8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Apr 2022 08:33:45 -0700 Subject: device-core: Kill the lockdep_mutex Per Peter [1], the lockdep API has native support for all the use cases lockdep_mutex was attempting to enable. Now that all lockdep_mutex users have been converted to those APIs, drop this lock. Link: https://lore.kernel.org/r/Ylf0dewci8myLvoW@hirez.programming.kicks-ass.net [1] Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Suggested-by: Peter Zijlstra Reviewed-by: Ira Weiny Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/165055522548.3745911.14298368286915484086.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/device.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 833b0b3b0193..073f1b0126ac 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -400,8 +400,6 @@ struct dev_msi_info { * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. - * @lockdep_mutex: An optional debug lock that a subsystem can use as a - * peer lock to gain localized lockdep coverage of the device_lock. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. @@ -499,9 +497,6 @@ struct device { core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ -#ifdef CONFIG_PROVE_LOCKING - struct mutex lockdep_mutex; -#endif struct mutex mutex; /* mutex to synchronize calls to * its driver. */ -- cgit v1.2.3 From 9096bbe951ddd3f9dc813e73b5bde6d5715d1cdb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:15:58 -0700 Subject: mm: shmem: make shmem_init return void The return value of shmem_init is never used. So we can make it return void now. [akpm@linux-foundation.org: remove `return;' from void-returning function, per Muchun Song] Link: https://lkml.kernel.org/r/20220328112707.22217-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Muchun Song Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ab51d3cd39bd..3e915cc550bc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -56,7 +56,7 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; -extern int shmem_init(void); +extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); -- cgit v1.2.3 From ef7a4ffc4c7f969d61e297c53cb2de1d6f012a11 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 28 Apr 2022 23:16:00 -0700 Subject: mm/memcontrol.c: make cgroup_memory_noswap static cgroup_memory_noswap is only used in mm/memcontrol.c, therefore just make it static, and remove export in include/linux/memcontrol.h Link: https://lkml.kernel.org/r/20220421124736.62180-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 89b14729d59f..fe580cb96683 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -937,10 +937,6 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -#ifdef CONFIG_MEMCG_SWAP -extern bool cgroup_memory_noswap; -#endif - void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); void lock_page_memcg(struct page *page); -- cgit v1.2.3 From 2ba2b008a8bf5fd268a43d03ba79e0ad464d6836 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 28 Apr 2022 23:16:02 -0700 Subject: Revert "mm/memory-failure.c: fix race with changing page compound again" Reverts commit 888af2701db7 ("mm/memory-failure.c: fix race with changing page compound again") because now we fetch the page refcount under hugetlb_lock in try_memory_failure_hugetlb() so that the race check is no longer necessary. Link: https://lkml.kernel.org/r/20220408135323.1559401-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Suggested-by: Miaohe Lin Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Miaohe Lin Cc: Yang Shi Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f44254af8ce..d446e834a3e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3251,7 +3251,6 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, - MF_MSG_DIFFERENT_PAGE_SIZE, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 7d6e2d96384556a4f30547803be1f606eb805a62 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 28 Apr 2022 23:16:09 -0700 Subject: mm: untangle config dependencies for demote-on-reclaim At the time demote-on-reclaim was introduced, it was tied to CONFIG_HOTPLUG_CPU + CONFIG_MIGRATE, but that is not really accurate. The only two things we need to depend on are CONFIG_NUMA + CONFIG_MIGRATE, so clean this up. Furthermore, we only register the hotplug memory notifier when the system has CONFIG_MEMORY_HOTPLUG. Link: https://lkml.kernel.org/r/20220322224016.4574-1-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Cc: Dave Hansen Cc: Abhishek Goel Cc: Baolin Wang Signed-off-by: Andrew Morton --- include/linux/migrate.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 90e75d5a54d6..2707bfd43a0d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -47,17 +47,8 @@ void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); -extern bool numa_demotion_enabled; -extern void migrate_on_reclaim_init(void); -#ifdef CONFIG_HOTPLUG_CPU -extern void set_migration_target_nodes(void); -#else -static inline void set_migration_target_nodes(void) {} -#endif #else -static inline void set_migration_target_nodes(void) {} - static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, @@ -82,9 +73,23 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, return -ENOSYS; } -#define numa_demotion_enabled false #endif /* CONFIG_MIGRATION */ +#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA) +extern void set_migration_target_nodes(void); +extern void migrate_on_reclaim_init(void); +extern bool numa_demotion_enabled; +extern int next_demotion_node(int node); +#else +static inline void set_migration_target_nodes(void) {} +static inline void migrate_on_reclaim_init(void) {} +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} +#define numa_demotion_enabled false +#endif + #ifdef CONFIG_COMPACTION extern int PageMovable(struct page *page); extern void __SetPageMovable(struct page *page, struct address_space *mapping); @@ -172,15 +177,6 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); -int next_demotion_node(int node); - -#else /* CONFIG_MIGRATION disabled: */ - -static inline int next_demotion_node(int node) -{ - return NUMA_NO_NODE; -} - #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ -- cgit v1.2.3 From 6a8e0596f00469c15ec556b9f3624acd2e9a04f9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: mm: rmap: introduce pfn_mkclean_range() to cleans PTEs The page_mkclean_one() is supposed to be used with the pfn that has a associated struct page, but not all the pfns (e.g. DAX) have a struct page. Introduce a new function pfn_mkclean_range() to cleans the PTEs (including PMDs) mapped with range of pfns which has no struct page associated with them. This helper will be used by DAX device in the next patch to make pfns clean. Link: https://lkml.kernel.org/r/20220403053957.10770-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alistair Popple Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 17230c458341..8573aae50d96 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,6 +261,9 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); +int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, + struct vm_area_struct *vma); + void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); /* -- cgit v1.2.3 From 0e5e64c0b0d7bb33a5e971ad17e771cf6e0a1127 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:10 -0700 Subject: mm: simplify follow_invalidate_pte() The only user (DAX) of range and pmdpp parameters of follow_invalidate_pte() is gone, it is safe to remove them and make it static to simlify the code. This is revertant of the following commits: 097963959594 ("mm: add follow_pte_pmd()") a4d1a8852513 ("dax: update to new mmu_notifier semantic") There is only one caller of the follow_invalidate_pte(). So just fold it into follow_pte() and remove it. Link: https://lkml.kernel.org/r/20220403053957.10770-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Dan Williams Cc: Hugh Dickins Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Ross Zwisler Cc: Xiongchun Duan Cc: Xiyu Yang Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d446e834a3e5..16d6b46d2d3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1845,9 +1845,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, - pmd_t **pmdpp, spinlock_t **ptlp); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, -- cgit v1.2.3 From 3afa793082e624f7bd83533010ff4a676451d4ee Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm/mmap: drop arch_vm_get_page_pgprot() There are no platforms left which use arch_vm_get_page_prot(). Just drop generic arch_vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-8-anshuman.khandual@arm.com Cc: Andrew Morton Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Signed-off-by: Anshuman Khandual Cc: Christoph Hellwig Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mman.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index b66e91b8176c..58b3abd457a3 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -93,10 +93,6 @@ static inline void vm_unacct_memory(long pages) #define arch_calc_vm_flag_bits(flags) 0 #endif -#ifndef arch_vm_get_page_prot -#define arch_vm_get_page_prot(vm_flags) __pgprot(0) -#endif - #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have -- cgit v1.2.3 From 5981611d0a006472d367d7a8e6ead8afaecf17c7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:14 -0700 Subject: mm: hugetlb_vmemmap: cleanup hugetlb_vmemmap related functions Patch series "cleanup hugetlb_vmemmap". The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize" is more clear. In this series, cheanup related codes to make it more clear and expressive. This is suggested by David. This patch (of 3): The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". And some function names are prefixed with "huge_page" instead of "hugetlb", it is easily to be confused with THP. In this patch, cheanup related functions to make code more clear and expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220404074652.68024-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac2a1d758a80..189bddb8eca4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -624,7 +624,7 @@ struct hstate { unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP - unsigned int nr_free_vmemmap_pages; + unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ -- cgit v1.2.3 From f10f1442c309ccef7a80ba3dc4abde0978e86fb4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm: hugetlb_vmemmap: cleanup hugetlb_free_vmemmap_enabled* The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup the static key and hugetlb_free_vmemmap_enabled() to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d8eeaa67d05..573f499d5a2d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -192,16 +192,16 @@ enum pageflags { #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - hugetlb_free_vmemmap_enabled_key); + hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_free_vmemmap_enabled(void) +static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, - &hugetlb_free_vmemmap_enabled_key); + &hugetlb_optimize_vmemmap_key); } /* - * If the feature of freeing some vmemmap pages associated with each HugeTLB + * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other @@ -218,7 +218,7 @@ static __always_inline bool hugetlb_free_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_free_vmemmap_enabled()) + if (!hugetlb_optimize_vmemmap_enabled()) return page; /* @@ -247,7 +247,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) return page; } -static inline bool hugetlb_free_vmemmap_enabled(void) +static inline bool hugetlb_optimize_vmemmap_enabled(void) { return false; } -- cgit v1.2.3 From 47010c040dec8af6347ec6259104fc13f7e7e30a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm: hugetlb_vmemmap: cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP* The word of "free" is not expressive enough to express the feature of optimizing vmemmap pages associated with each HugeTLB, rename this keywork to "optimize". In this patch , cheanup configs to make code more expressive. Link: https://lkml.kernel.org/r/20220404074652.68024-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 2 +- include/linux/page-flags.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 189bddb8eca4..ac2ece9e9c79 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -623,7 +623,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP unsigned int optimize_vmemmap_pages; #endif #ifdef CONFIG_CGROUP_HUGETLB diff --git a/include/linux/mm.h b/include/linux/mm.h index 16d6b46d2d3e..a8f4c7e96ad5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3145,7 +3145,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 573f499d5a2d..1ea896887ee4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,13 +190,13 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) { - return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, &hugetlb_optimize_vmemmap_key); } -- cgit v1.2.3 From e3246d8f52173a798710314a42fea83223036fc8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:15 -0700 Subject: mm/sparse-vmemmap: add a pgmap argument to section activation Patch series "sparse-vmemmap: memory savings for compound devmaps (device-dax)", v9. This series minimizes 'struct page' overhead by pursuing a similar approach as Muchun Song series "Free some vmemmap pages of hugetlb page" (now merged since v5.14), but applied to devmap with @vmemmap_shift (device-dax). The vmemmap dedpulication original idea (already used in HugeTLB) is to reuse/deduplicate tail page vmemmap areas, particular the area which only describes tail pages. So a vmemmap page describes 64 struct pages, and the first page for a given ZONE_DEVICE vmemmap would contain the head page and 63 tail pages. The second vmemmap page would contain only tail pages, and that's what gets reused across the rest of the subsection/section. The bigger the page size, the bigger the savings (2M hpage -> save 6 vmemmap pages; 1G hpage -> save 4094 vmemmap pages). This is done for PMEM /specifically only/ on device-dax configured namespaces, not fsdax. In other words, a devmap with a @vmemmap_shift. In terms of savings, per 1Tb of memory, the struct page cost would go down with compound devmap: * with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory) * with 1G pages we lose 40MB instead of 16G (0.0014% instead of 1.5% of total memory) The series is mostly summed up by patch 4, and to summarize what the series does: Patches 1 - 3: Minor cleanups in preparation for patch 4. Move the very nice docs of hugetlb_vmemmap.c into a Documentation/vm/ entry. Patch 4: Patch 4 is the one that takes care of the struct page savings (also referred to here as tail-page/vmemmap deduplication). Much like Muchun series, we reuse the second PTE tail page vmemmap areas across a given @vmemmap_shift On important difference though, is that contrary to the hugetlbfs series, there's no vmemmap for the area because we are late-populating it as opposed to remapping a system-ram range. IOW no freeing of pages of already initialized vmemmap like the case for hugetlbfs, which greatly simplifies the logic (besides not being arch-specific). altmap case unchanged and still goes via the vmemmap_populate(). Also adjust the newly added docs to the device-dax case. [Note that device-dax is still a little behind HugeTLB in terms of savings. I have an additional simple patch that reuses the head vmemmap page too, as a follow-up. That will double the savings and namespaces initialization.] Patch 5: Initialize fewer struct pages depending on the page size with DRAM backed struct pages -- because fewer pages are unique and most tail pages (with bigger vmemmap_shift). NVDIMM namespace bootstrap improves from ~268-358 ms to ~80-110/<1ms on 128G NVDIMMs with 2M and 1G respectivally. And struct page needed capacity will be 3.8x / 1071x smaller for 2M and 1G respectivelly. Tested on x86 with 1.5Tb of pmem (including pinning, and RDMA registration/deregistration scalability with 2M MRs) This patch (of 5): In support of using compound pages for devmap mappings, plumb the pgmap down to the vmemmap_populate implementation. Note that while altmap is retrievable from pgmap the memory hotplug code passes altmap without pgmap[*], so both need to be independently plumbed. So in addition to @altmap, pass @pgmap to sparse section populate functions namely: sparse_add_section section_activate populate_section_memmap __populate_section_memmap Passing @pgmap allows __populate_section_memmap() to both fetch the vmemmap_shift in which memmap metadata is created for and also to let sparse-vmemmap fetch pgmap ranges to co-relate to a given section and pick whether to just reuse tail pages from past onlined sections. While at it, fix the kdoc for @altmap for sparse_add_section(). [*] https://lore.kernel.org/linux-mm/20210319092635.6214-1-osalvador@suse.de/ Link: https://lkml.kernel.org/r/20220420155310.9712-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20220420155310.9712-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Reviewed-by: Muchun Song Cc: Vishal Verma Cc: Matthew Wilcox Cc: Jason Gunthorpe Cc: Jane Chu Cc: Mike Kravetz Cc: Jonathan Corbet Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 ++++- include/linux/mm.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1ce6f8044f1e..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -15,6 +15,7 @@ struct memory_block; struct memory_group; struct resource; struct vmem_altmap; +struct dev_pagemap; #ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION /* @@ -122,6 +123,7 @@ typedef int __bitwise mhp_t; struct mhp_params { struct vmem_altmap *altmap; pgprot_t pgprot; + struct dev_pagemap *pgmap; }; bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); @@ -333,7 +335,8 @@ extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); extern void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index a8f4c7e96ad5..80bba49387e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3154,7 +3154,8 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, - unsigned long nr_pages, int nid, struct vmem_altmap *altmap); + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); -- cgit v1.2.3 From 4917f55b4ef963e2d2288fe4eb651728be8db406 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: mm/sparse-vmemmap: improve memory savings for compound devmaps A compound devmap is a dev_pagemap with @vmemmap_shift > 0 and it means that pages are mapped at a given huge page alignment and utilize uses compound pages as opposed to order-0 pages. Take advantage of the fact that most tail pages look the same (except the first two) to minimize struct page overhead. Allocate a separate page for the vmemmap area which contains the head page and separate for the next 64 pages. The rest of the subsections then reuse this tail vmemmap page to initialize the rest of the tail pages. Sections are arch-dependent (e.g. on x86 it's 64M, 128M or 512M) and when initializing compound devmap with big enough @vmemmap_shift (e.g. 1G PUD) it may cross multiple sections. The vmemmap code needs to consult @pgmap so that multiple sections that all map the same tail data can refer back to the first copy of that data for a given gigantic page. On compound devmaps with 2M align, this mechanism lets 6 pages be saved out of the 8 necessary PFNs necessary to set the subsection's 512 struct pages being mapped. On a 1G compound devmap it saves 4094 pages. Altmap isn't supported yet, given various restrictions in altmap pfn allocator, thus fallback to the already in use vmemmap_populate(). It is worth noting that altmap for devmap mappings was there to relieve the pressure of inordinate amounts of memmap space to map terabytes of pmem. With compound pages the motivation for altmaps for pmem gets reduced. Link: https://lkml.kernel.org/r/20220420155310.9712-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Cc: Christoph Hellwig Cc: Dan Williams Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Vishal Verma Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bba49387e9..b9316b2c11ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3161,7 +3161,7 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap); + struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, -- cgit v1.2.3 From ba91fb7dd03c4f463453f28af24851097a6547fb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: include/linux/swapops.h: remove stub for non_swap_entry() The stub for non_swap_entry() may not help much, because MAX_SWAPFILES has already contained all the information to decide whether a swap entry is real swap entry or pesudo ones (migrations, ...). There can be some performance influences on non_swap_entry() with below conditions all met: !CONFIG_MIGRATION && !CONFIG_MEMORY_FAILURE && !CONFIG_DEVICE_PRIVATE But that's definitely not the major config most machines will use, at the meantime it's already in a slow path of swap entry (being parsed from a swap pte), so IMHO it shouldn't be a major issue. Also according to the analysis from Alistair, somehow the stub didn't do the job right [1]. To make the code cleaner, let's drop the stub. [1] https://lore.kernel.org/lkml/8735ihbw6g.fsf@nvdebian.thelocal/ Note: the uffd-wp shmem & hugetlbfs series will need this patch to make sure swap entries work as expected with below config as spotted by Alistair: !CONFIG_MIGRATION && !CONFIG_MEMORY_FAILURE && !CONFIG_DEVICE_PRIVATE && CONFIG_PTE_MARKER (PS: this config should mostly never gonna happen, though, afaict..) Link: https://lkml.kernel.org/r/20220413191147.66645-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swapops.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d356ab4047f7..5af852b68805 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -387,18 +387,10 @@ static inline void num_poisoned_pages_inc(void) } #endif -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \ - defined(CONFIG_DEVICE_PRIVATE) static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } -#else -static inline int non_swap_entry(swp_entry_t entry) -{ - return 0; -} -#endif #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3 From 7609385337a4feb6236e42dcd0df2185683ce839 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: ksm: count ksm merging pages for each process Some applications or containers want to use KSM by calling madvise() to advise areas of address space to be MERGEABLE. But they may not know which applications are more likely to cause real merges in the deployment. If this patch is applied, it helps them know their corresponding number of merged pages, and then optimize their app code. As current KSM only counts the number of KSM merging pages(e.g. ksm_pages_sharing and ksm_pages_shared) of the whole system, we cannot see the more fine-grained KSM merging, for the upper application optimization, the merging area cannot be set easily according to the KSM page merging probability of each process. Therefore, it is necessary to add extra statistical means so that the upper level users can know the detailed KSM merging information of each process. We add a new proc file named as ksm_merging_pages under /proc// to indicate the involved ksm merging pages of this process. [akpm@linux-foundation.org: fix comment typo, remove BUG_ON()s] Link: https://lkml.kernel.org/r/20220325082318.2352853-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reported-by: kernel test robot Reviewed-by: Yang Yang Reviewed-by: Ran Xiaokai Reported-by: Zeal Robot Cc: Kees Cook Cc: Alexey Dobriyan Cc: Stephen Rothwell Cc: Ohhoon Kwon Cc: Matthew Wilcox (Oracle) Cc: Stephen Brennan Cc: Vlastimil Babka Cc: Feng Tang Cc: Yang Yang Cc: Ran Xiaokai Cc: Zeal Robot Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8834e38c06a4..87eddd509de2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,13 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SVA u32 pasid; +#endif +#ifdef CONFIG_KSM + /* + * Represent how many pages of this process are involved in KSM + * merging. + */ + unsigned long ksm_merging_pages; #endif } __randomize_layout; -- cgit v1.2.3 From 94bfe85bde18a99612bb5d0ce348643c2d352836 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 28 Apr 2022 23:16:16 -0700 Subject: mm/vmstat: add events for ksm cow Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want to save memory, it's a tradeoff by suffering delay on ksm cow. Users can get to know how much memory ksm saved by reading /sys/kernel/mm/ksm/pages_sharing, but they don't know what's the costs of ksm cow, and this is important of some delay sensitive tasks. So add ksm cow events to help users evaluate whether or how to use ksm. Also update Documentation/admin-guide/mm/ksm.rst with new added events. Link: https://lkml.kernel.org/r/20220331035616.2390805-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: David Hildenbrand Reviewed-by: xu xin Reviewed-by: Ran Xiaokai Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Cc: Dave Hansen Cc: Saravanan D Cc: Minchan Kim Cc: John Hubbard Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 16a0a4fd000b..e83967e4c20e 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -133,6 +133,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSM_SWPIN_COPY, #endif #endif +#ifdef CONFIG_KSM + COW_KSM, +#endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, -- cgit v1.2.3 From 024c61eaff17f6a8c73cb8b25137251f0c3ef039 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Apr 2022 23:16:17 -0700 Subject: mm: compaction: remove unneeded return value of kcompactd_run Patch series "A few cleanup and fixup patches for compaction". This series contains a few patches to clean up some obsolete comment, remove unneeded return value and so on. Also we fix the possible NULL pointer dereference. More details can be found in the respective changelogs. This patch (of 12): The return value of kcompactd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20220418141253.24298-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220418141253.24298-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc; Mel Gorman Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Pintu Kumar Cc: Charan Teja Kalla Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 34bce35c808d..52a9ff65faee 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -177,7 +177,7 @@ static inline bool compaction_withdrawn(enum compact_result result) bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); -extern int kcompactd_run(int nid); +extern void kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); @@ -212,9 +212,8 @@ static inline bool compaction_withdrawn(enum compact_result result) return true; } -static inline int kcompactd_run(int nid) +static inline void kcompactd_run(int nid) { - return 0; } static inline void kcompactd_stop(int nid) { -- cgit v1.2.3 From 766475cb526b2fc5ca21374b5810d6d8557870fc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Apr 2022 12:28:59 +0200 Subject: ARM: omap1: add back omap_set_dma_priority() stub One of my multiplatform patches went a little too far and removed a declaration that is needed for compile-testing the omapfb driver on non-OMAP1 platforms: arm-linux-gnueabi-ld: drivers/video/fbdev/omap/omapfb_main.o: in function `omapfb_do_probe': omapfb_main.c:(.text+0x41ec): undefined reference to `omap_set_dma_priority' Add back the inline stub, and in turn hide the definition when omapfb is disabled, like we do for the usb specific bits. Reported-by: kernel test robot Reviewed-by: Tony Lindgren Fixes: 52ef8efcb75e ("dma: omap: hide legacy interface") Signed-off-by: Arnd Bergmann --- include/linux/omap-dma.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/omap-dma.h b/include/linux/omap-dma.h index 254b4e10511b..6f6c31e3fb93 100644 --- a/include/linux/omap-dma.h +++ b/include/linux/omap-dma.h @@ -294,7 +294,14 @@ struct omap_system_dma_plat_info { extern struct omap_system_dma_plat_info *omap_get_plat_info(void); +#if defined(CONFIG_ARCH_OMAP1) extern void omap_set_dma_priority(int lch, int dst_port, int priority); +#else +static inline void omap_set_dma_priority(int lch, int dst_port, int priority) +{ +} +#endif + extern int omap_request_dma(int dev_id, const char *dev_name, void (*callback)(int lch, u16 ch_status, void *data), void *data, int *dma_ch); -- cgit v1.2.3 From 50e7b416d2ab10b9771bd00a4d85df90ad2e4b37 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Apr 2022 15:43:37 +0100 Subject: sched/fair: Remove sched_trace_*() helper functions We no longer need them as we can use DWARF debug info or BTF + pahole to re-generate the required structs to compile against them for a given kernel. This moves the burden of maintaining these helper functions to the module. https://github.com/qais-yousef/sched_tp Note that pahole v1.15 is required at least for using DWARF. And for BTF v1.23 which is not yet released will be required. There's alignment problem that will lead to crashes in earlier versions when used with BTF. We should have enough infrastructure to make these helper functions now obsolete, so remove them. [Rewrote commit message to reflect the new alternative] Signed-off-by: Dietmar Eggemann Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220428144338.479094-2-qais.yousef@arm.com --- include/linux/sched.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 67f06f72c50e..fc74ea2578b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2378,20 +2378,6 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); - -int sched_trace_rq_cpu(struct rq *rq); -int sched_trace_rq_cpu_capacity(struct rq *rq); -int sched_trace_rq_nr_running(struct rq *rq); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd); - #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); -- cgit v1.2.3 From 498af8d1678ae2351218337b47bbf3cb0fc16821 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 29 Apr 2022 12:39:45 +0100 Subject: firmware: arm_ffa: Add ffa_dev_get_drvdata helper function Add a helper function to fetch ffa_dev's driver_data using dev_get_drvdata. At the same time move existing ffa_dev_set_drvdata to use dev_set_drvdata. Link: https://lore.kernel.org/r/20220429113946.2087145-3-sudeep.holla@arm.com Suggested-by: Arunachalam Ganapathy Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 85651e41ded8..e5c76c1ef9ed 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -38,7 +38,12 @@ struct ffa_driver { static inline void ffa_dev_set_drvdata(struct ffa_device *fdev, void *data) { - fdev->dev.driver_data = data; + dev_set_drvdata(&fdev->dev, data); +} + +static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) +{ + return dev_get_drvdata(&fdev->dev); } #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -- cgit v1.2.3 From 892de36fd4a98fab3298d417c051d9099af5448d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Apr 2022 12:22:10 -0400 Subject: SUNRPC: Ensure gss-proxy connects on setup For reasons best known to the author, gss-proxy does not implement a NULL procedure, and returns RPC_PROC_UNAVAIL. However we still want to ensure that we connect to the service at setup time. So add a quirk-flag specially for this case. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 267b7aeaf1a6..db5149567305 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, -- cgit v1.2.3 From ec2a0f9c8b50865a11720651c9c536f20c578def Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 29 Apr 2022 14:36:58 -0700 Subject: kasan: mark KASAN_VMALLOC flags as kasan_vmalloc_flags_t Fix sparse warning: mm/kasan/shadow.c:496:15: warning: restricted kasan_vmalloc_flags_t degrades to integer Link: https://lkml.kernel.org/r/52d8fccdd3a48d4bdfd0ff522553bac2a13f1579.1649351254.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Cc: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index ceebcb9de7bf..b092277bf48d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -23,10 +23,10 @@ struct task_struct; typedef unsigned int __bitwise kasan_vmalloc_flags_t; -#define KASAN_VMALLOC_NONE 0x00u -#define KASAN_VMALLOC_INIT 0x01u -#define KASAN_VMALLOC_VM_ALLOC 0x02u -#define KASAN_VMALLOC_PROT_NORMAL 0x04u +#define KASAN_VMALLOC_NONE ((__force kasan_vmalloc_flags_t)0x00u) +#define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) +#define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) +#define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -- cgit v1.2.3 From 5d8de293c224896a4da99763fce4f9794308caf4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: vmcore: convert copy_oldmem_page() to take an iov_iter Patch series "Convert vmcore to use an iov_iter", v5. For some reason several people have been sending bad patches to fix compiler warnings in vmcore recently. Here's how it should be done. Compile-tested only on x86. As noted in the first patch, s390 should take this conversion a bit further, but I'm not inclined to do that work myself. This patch (of 3): Instead of passing in a 'buf' and 'userbuf' argument, pass in an iov_iter. s390 needs more work to pass the iov_iter down further, or refactor, but I'd be more comfortable if someone who can test on s390 did that work. It's more convenient to convert the whole of read_from_oldmem() to take an iov_iter at the same time, so rename it to read_from_oldmem_iter() and add a temporary read_from_oldmem() wrapper that creates an iov_iter. Link: https://lkml.kernel.org/r/20220408090636.560886-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220408090636.560886-2-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Heiko Carstens Signed-off-by: Andrew Morton --- include/linux/crash_dump.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 620821549b23..a1cf7d5c03c7 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -24,11 +24,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); -extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, - unsigned long, int); -extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf); +ssize_t copy_oldmem_page(struct iov_iter *i, unsigned long pfn, size_t csize, + unsigned long offset); +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset); void vmcore_cleanup(void); -- cgit v1.2.3 From e0690479917cbce740eef51fa3de92c69647a5ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 14:37:59 -0700 Subject: vmcore: convert read_from_oldmem() to take an iov_iter Remove the read_from_oldmem() wrapper introduced earlier and convert all the remaining callers to pass an iov_iter. Link: https://lkml.kernel.org/r/20220408090636.560886-4-bhe@redhat.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Tiezhu Yang Cc: Amit Daniel Kachhap Cc: Al Viro Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/crash_dump.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index a1cf7d5c03c7..0f3a656293b0 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -134,13 +134,11 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data) #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #ifdef CONFIG_PROC_VMCORE -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted); +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted); #else -static inline ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +static inline ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { return -EOPNOTSUPP; } -- cgit v1.2.3 From f485922d8fe4e44f6d52a5bb95a603b7c65554bb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: pipe: make poll_usage boolean and annotate its access Patch series "Fix data-races around epoll reported by KCSAN." This series suppresses a false positive KCSAN's message and fixes a real data-race. This patch (of 2): pipe_poll() runs locklessly and assigns 1 to poll_usage. Once poll_usage is set to 1, it never changes in other places. However, concurrent writes of a value trigger KCSAN, so let's make KCSAN happy. BUG: KCSAN: data-race in pipe_poll / pipe_poll write to 0xffff8880042f6678 of 4 bytes by task 174 on cpu 3: pipe_poll (fs/pipe.c:656) ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853) do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234) __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) write to 0xffff8880042f6678 of 4 bytes by task 177 on cpu 1: pipe_poll (fs/pipe.c:656) ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853) do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234) __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 177 Comm: epoll_race Not tainted 5.17.0-58927-gf443e374ae13 #6 Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014 Link: https://lkml.kernel.org/r/20220322002653.33865-1-kuniyu@amazon.co.jp Link: https://lkml.kernel.org/r/20220322002653.33865-2-kuniyu@amazon.co.jp Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads") Signed-off-by: Kuniyuki Iwashima Cc: Alexander Duyck Cc: Al Viro Cc: Davidlohr Bueso Cc: Kuniyuki Iwashima Cc: "Soheil Hassas Yeganeh" Cc: "Sridhar Samudrala" Signed-off-by: Andrew Morton --- include/linux/pipe_fs_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c00c618ef290..cb0fd633a610 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -71,7 +71,7 @@ struct pipe_inode_info { unsigned int files; unsigned int r_counter; unsigned int w_counter; - unsigned int poll_usage; + bool poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; -- cgit v1.2.3 From d679ae94fdd5d3ab00c35078f5af5f37e068b03d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Apr 2022 14:38:01 -0700 Subject: list: fix a data-race around ep->rdllist ep_poll() first calls ep_events_available() with no lock held and checks if ep->rdllist is empty by list_empty_careful(), which reads rdllist->prev. Thus all accesses to it need some protection to avoid store/load-tearing. Note INIT_LIST_HEAD_RCU() already has the annotation for both prev and next. Commit bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") added the first lockless ep_events_available(), and commit c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") made some ep_events_available() calls lockless and added single call under a lock, finally commit e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") made the last ep_events_available() lockless. BUG: KCSAN: data-race in do_epoll_wait / do_epoll_wait write to 0xffff88810480c7d8 of 8 bytes by task 1802 on cpu 0: INIT_LIST_HEAD include/linux/list.h:38 [inline] list_splice_init include/linux/list.h:492 [inline] ep_start_scan fs/eventpoll.c:622 [inline] ep_send_events fs/eventpoll.c:1656 [inline] ep_poll fs/eventpoll.c:1806 [inline] do_epoll_wait+0x4eb/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88810480c7d8 of 8 bytes by task 1799 on cpu 1: list_empty_careful include/linux/list.h:329 [inline] ep_events_available fs/eventpoll.c:381 [inline] ep_poll fs/eventpoll.c:1797 [inline] do_epoll_wait+0x279/0xf40 fs/eventpoll.c:2234 do_epoll_pwait fs/eventpoll.c:2268 [inline] __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline] __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffff88810480c7d0 -> 0xffff888103c15098 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1799 Comm: syz-fuzzer Tainted: G W 5.17.0-rc7-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Link: https://lkml.kernel.org/r/20220322002653.33865-3-kuniyu@amazon.co.jp Fixes: e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout") Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()") Fixes: bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") Signed-off-by: Kuniyuki Iwashima Reported-by: syzbot+bdd6e38a1ed5ee58d8bd@syzkaller.appspotmail.com Cc: Al Viro , Andrew Morton Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: "Soheil Hassas Yeganeh" Cc: Davidlohr Bueso Cc: "Sridhar Samudrala" Cc: Alexander Duyck Signed-off-by: Andrew Morton --- include/linux/list.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..d7d2bfa1a365 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -35,7 +35,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); - list->prev = list; + WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST @@ -306,7 +306,7 @@ static inline int list_empty(const struct list_head *head) static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); - entry->prev = entry; + WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } @@ -326,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return list_is_head(next, head) && (next == head->prev); + return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** -- cgit v1.2.3 From a9866bef5171c859cfabc1155c594d28f194aa23 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 29 Apr 2022 14:38:02 -0700 Subject: ptrace: fix wrong comment of PT_DTRACE PT_DTRACE is only used on um now, fix the wrong comment. Link: https://lkml.kernel.org/r/1649240981-11024-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- include/linux/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 15b3d176b6b4..db4509587d2c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -30,7 +30,7 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 -#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ +#define PT_DTRACE 0x00000002 /* delayed trace (used on um) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ -- cgit v1.2.3 From f94fd25cb0aaf77fd7453f31c5d394a1a68ecf60 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 18:45:06 -0600 Subject: tcp: pass back data left in socket after receive This is currently done for CMSG_INQ, add an ability to do so via struct msghdr as well and have CMSG_INQ use that too. If the caller sets msghdr->msg_get_inq, then we'll pass back the hint in msghdr->msg_inq. Rearrange struct msghdr a bit so we can add this member while shrinking it at the same time. On a 64-bit build, it was 96 bytes before this change and 88 bytes afterwards. Reviewed-by: Eric Dumazet Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/650c22ca-cffc-0255-9a05-2413a1e20826@kernel.dk Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 6f85f5d957ef..12085c9a8544 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,6 +50,9 @@ struct linger { struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ + + int msg_inq; /* output, data left in socket */ + struct iov_iter msg_iter; /* data */ /* @@ -62,8 +65,9 @@ struct msghdr { void __user *msg_control_user; }; bool msg_control_is_user : 1; - __kernel_size_t msg_controllen; /* ancillary data buffer length */ + bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ + __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ }; -- cgit v1.2.3 From 657dd5f97b2ed16cdaa6339f42f9130240af1c04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:45 +0100 Subject: net: inline skb_zerocopy_iter_dgram skb_zerocopy_iter_dgram() is a small proxy function, inline it. For that, move __zerocopy_sg_from_iter into linux/skbuff.h Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dc4b3e1cf21b..3270cb72e4d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -684,20 +684,6 @@ struct ubuf_info { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); -struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); - -void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); - -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success); - -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); -int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, - struct msghdr *msg, int len, - struct ubuf_info *uarg); - /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -1679,6 +1665,28 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif +struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); + +void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); + +void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); + +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, + struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) -- cgit v1.2.3 From c526fd8f9f4f21cb83c0b1c9a1ee9c0ac9be9e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:46 +0100 Subject: net: inline dev_queue_xmit() Inline dev_queue_xmit() and dev_queue_xmit_accel(), they both are small proxy functions doing nothing but redirecting the control flow to __dev_queue_xmit(). Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b75ca2d095ae..4aba92a4042a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2940,10 +2940,20 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +static inline int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} + +static inline int dev_queue_xmit_accel(struct sk_buff *skb, + struct net_device *sb_dev) +{ + return __dev_queue_xmit(skb, sb_dev); +} + static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; -- cgit v1.2.3 From 48cec73a891cca087fbc7791c4753784180991a9 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Fri, 29 Apr 2022 09:19:53 +0200 Subject: net: lan966x: Fix compilation error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting from the blamed commit, the lan966x build fails with the following compilation error: drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c:342:9: error: implicit declaration of function ‘ptp_find_pin_unlocked’ [-Werror=implicit-function-declaration] 342 | pin = ptp_find_pin_unlocked(phc->clock, PTP_PF_EXTTS, 0); The issue is that there is no stub function for ptp_find_pin_unlocked in case CONFIG_PTP_1588_CLOCK is not selected. Therefore add one. Reported-by: kernel test robot Fixes: f3d8e0a9c28ba0 ("net: lan966x: Add support for PTP_PF_EXTTS") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 554454cb8693..e8cc8b6bbf50 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -321,6 +321,10 @@ static inline int ptp_clock_index(struct ptp_clock *ptp) static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } +static inline int ptp_find_pin_unlocked(struct ptp_clock *ptp, + enum ptp_pin_function func, + unsigned int chan) +{ return -1; } static inline int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) { return -EOPNOTSUPP; } -- cgit v1.2.3 From e788be95a57a9bebe446878ce9bf2750f6fe4974 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Apr 2022 17:25:16 -0600 Subject: task_work: allow TWA_SIGNAL without a rescheduling IPI Some use cases don't always need an IPI when sending a TWA_SIGNAL notification. Add TWA_SIGNAL_NO_IPI, which is just like TWA_SIGNAL, except it doesn't send an IPI to the target task. It merely sets TIF_NOTIFY_SIGNAL and wakes up the task. This can be useful in avoiding a forceful transition to the kernel if the task is running in userspace. Depending on the task_work in question, it may be quite fine waiting for the next reschedule or kernel enter anyway, or the use case may even have other mechanisms for hinting to the task that a transition may be useful. This can drive more cooperative scheduling of task_work. Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/821f42b6-7d91-8074-8212-d34998097de4@kernel.dk Signed-off-by: Jens Axboe --- include/linux/sched/signal.h | 13 +++++++++++-- include/linux/task_work.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..66b689f6cfcb 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -355,14 +355,23 @@ static inline void clear_notify_signal(void) smp_mb__after_atomic(); } +/* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { - if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && - !wake_up_state(task, TASK_INTERRUPTIBLE)) + if (__set_notify_signal(task)) kick_process(task); } diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 897494b597ba..795ef5a68429 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -17,6 +17,7 @@ enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, + TWA_SIGNAL_NO_IPI, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From d664e399128bd78b905ff480917e2c2d4949e101 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Apr 2022 15:31:02 +0200 Subject: sched: Fix missing prototype warnings A W=1 build emits more than a dozen missing prototype warnings related to scheduler and scheduler specific includes. Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220413133024.249118058@linutronix.de --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fc74ea2578b7..a27316f5f737 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2388,4 +2388,6 @@ static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } #endif +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + #endif -- cgit v1.2.3 From 1a90bfd220201fbe050dfc15deaac20ca5f15638 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 13 Apr 2022 15:31:05 +0200 Subject: smp: Make softirq handling RT safe in flush_smp_call_function_queue() flush_smp_call_function_queue() invokes do_softirq() which is not available on PREEMPT_RT. flush_smp_call_function_queue() is invoked from the idle task and the migration task with preemption or interrupts disabled. So RT kernels cannot process soft interrupts in that context as that has to acquire 'sleeping spinlocks' which is not possible with preemption or interrupts disabled and forbidden from the idle task anyway. The currently known SMP function call which raises a soft interrupt is in the block layer, but this functionality is not enabled on RT kernels due to latency and performance reasons. RT could wake up ksoftirqd unconditionally, but this wants to be avoided if there were soft interrupts pending already when this is invoked in the context of the migration task. The migration task might have preempted a threaded interrupt handler which raised a soft interrupt, but did not reach the local_bh_enable() to process it. The "running" ksoftirqd might prevent the handling in the interrupt thread context which is causing latency issues. Add a new function which handles this case explicitely for RT and falls back to do_softirq() on !RT kernels. In the RT case this warns when one of the flushed SMP function calls raised a soft interrupt so this can be investigated. [ tglx: Moved the RT part out of SMP code ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/YgKgL6aPj8aBES6G@linutronix.de Link: https://lore.kernel.org/r/20220413133024.356509586@linutronix.de --- include/linux/interrupt.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f40754caaefa..a49fe8d88676 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -607,6 +607,15 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); +#ifdef CONFIG_PREEMPT_RT +extern void do_softirq_post_smp_call_flush(unsigned int was_pending); +#else +static inline void do_softirq_post_smp_call_flush(unsigned int unused) +{ + do_softirq(); +} +#endif + extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); -- cgit v1.2.3 From 47f753c1108e287edb3e27fad8a7511a9d55578e Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 29 Apr 2022 19:58:07 +0800 Subject: net: stmmac: disable Split Header (SPH) for Intel platforms Based on DesignWare Ethernet QoS datasheet, we are seeing the limitation of Split Header (SPH) feature is not supported for Ipv4 fragmented packet. This SPH limitation will cause ping failure when the packets size exceed the MTU size. For example, the issue happens once the basic ping packet size is larger than the configured MTU size and the data is lost inside the fragmented packet, replaced by zeros/corrupted values, and leads to ping fail. So, disable the Split Header for Intel platforms. v2: Add fixes tag in commit message. Fixes: 67afd6d1cfdf("net: stmmac: Add Split Header support and enable it in XGMAC cores") Cc: # 5.10.x Suggested-by: Ong, Boon Leong Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: Wong Vee Khee Signed-off-by: Tan Tee Min Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 24eea1b05ca2..29917850f079 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -270,5 +270,6 @@ struct plat_stmmacenet_data { int msi_rx_base_vec; int msi_tx_base_vec; bool use_phy_wol; + bool sph_disable; }; #endif -- cgit v1.2.3 From 9bd1d9a0d8bb1a549831fd98fcc3105960f7068b Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 1 May 2022 16:55:06 +0200 Subject: soc: apple: Add RTKit IPC library Apple SoCs such as the M1 come with multiple embedded co-processors running proprietary firmware. Communication with those is established over a simple mailbox using the RTKit IPC protocol. This cannot be implemented inside the mailbox subsystem since on top of communication over channels we also need support for starting, hibernating and resetting these co-processors. We also need to handle shared memory allocations differently depending on the co-processor and don't want to split that across multiple drivers. Reviewed-by: Arnd Bergmann Signed-off-by: Sven Peter --- include/linux/soc/apple/rtkit.h | 155 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 include/linux/soc/apple/rtkit.h (limited to 'include/linux') diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h new file mode 100644 index 000000000000..88eb832eac7b --- /dev/null +++ b/include/linux/soc/apple/rtkit.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* + * Apple RTKit IPC Library + * Copyright (C) The Asahi Linux Contributors + * + * Apple's SoCs come with various co-processors running their RTKit operating + * system. This protocol library is used by client drivers to use the + * features provided by them. + */ +#ifndef _LINUX_APPLE_RTKIT_H_ +#define _LINUX_APPLE_RTKIT_H_ + +#include +#include +#include + +/* + * Struct to represent implementation-specific RTKit operations. + * + * @buffer: Shared memory buffer allocated inside normal RAM. + * @iomem: Shared memory buffer controlled by the co-processors. + * @size: Size of the shared memory buffer. + * @iova: Device VA of shared memory buffer. + * @is_mapped: Shared memory buffer is managed by the co-processor. + */ + +struct apple_rtkit_shmem { + void *buffer; + void __iomem *iomem; + size_t size; + dma_addr_t iova; + bool is_mapped; +}; + +/* + * Struct to represent implementation-specific RTKit operations. + * + * @crashed: Called when the co-processor has crashed. Runs in process + * context. + * @recv_message: Function called when a message from RTKit is received + * on a non-system endpoint. Called from a worker thread. + * @recv_message_early: + * Like recv_message, but called from atomic context. It + * should return true if it handled the message. If it + * returns false, the message will be passed on to the + * worker thread. + * @shmem_setup: Setup shared memory buffer. If bfr.is_iomem is true the + * buffer is managed by the co-processor and needs to be mapped. + * Otherwise the buffer is managed by Linux and needs to be + * allocated. If not specified dma_alloc_coherent is used. + * Called in process context. + * @shmem_destroy: Undo the shared memory buffer setup in shmem_setup. If not + * specified dma_free_coherent is used. Called in process + * context. + */ +struct apple_rtkit_ops { + void (*crashed)(void *cookie); + void (*recv_message)(void *cookie, u8 endpoint, u64 message); + bool (*recv_message_early)(void *cookie, u8 endpoint, u64 message); + int (*shmem_setup)(void *cookie, struct apple_rtkit_shmem *bfr); + void (*shmem_destroy)(void *cookie, struct apple_rtkit_shmem *bfr); +}; + +struct apple_rtkit; + +/* + * Initializes the internal state required to handle RTKit. This + * should usually be called within _probe. + * + * @dev: Pointer to the device node this coprocessor is assocated with + * @cookie: opaque cookie passed to all functions defined in rtkit_ops + * @mbox_name: mailbox name used to communicate with the co-processor + * @mbox_idx: mailbox index to be used if mbox_name is NULL + * @ops: pointer to rtkit_ops to be used for this co-processor + */ +struct apple_rtkit *devm_apple_rtkit_init(struct device *dev, void *cookie, + const char *mbox_name, int mbox_idx, + const struct apple_rtkit_ops *ops); + +/* + * Reinitialize internal structures. Must only be called with the co-processor + * is held in reset. + */ +int apple_rtkit_reinit(struct apple_rtkit *rtk); + +/* + * Handle RTKit's boot process. Should be called after the CPU of the + * co-processor has been started. + */ +int apple_rtkit_boot(struct apple_rtkit *rtk); + +/* + * Quiesce the co-processor. + */ +int apple_rtkit_quiesce(struct apple_rtkit *rtk); + +/* + * Wake the co-processor up from hibernation mode. + */ +int apple_rtkit_wake(struct apple_rtkit *rtk); + +/* + * Shutdown the co-processor + */ +int apple_rtkit_shutdown(struct apple_rtkit *rtk); + +/* + * Checks if RTKit is running and ready to handle messages. + */ +bool apple_rtkit_is_running(struct apple_rtkit *rtk); + +/* + * Checks if RTKit has crashed. + */ +bool apple_rtkit_is_crashed(struct apple_rtkit *rtk); + +/* + * Starts an endpoint. Must be called after boot but before any messages can be + * sent or received from that endpoint. + */ +int apple_rtkit_start_ep(struct apple_rtkit *rtk, u8 endpoint); + +/* + * Send a message to the given endpoint. + * + * @rtk: RTKit reference + * @ep: target endpoint + * @message: message to be sent + * @completeion: will be completed once the message has been submitted + * to the hardware FIFO. Can be NULL. + * @atomic: if set to true this function can be called from atomic + * context. + */ +int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message, + struct completion *completion, bool atomic); + +/* + * Send a message to the given endpoint and wait until it has been submitted + * to the hardware FIFO. + * Will return zero on success and a negative error code on failure + * (e.g. -ETIME when the message couldn't be written within the given + * timeout) + * + * @rtk: RTKit reference + * @ep: target endpoint + * @message: message to be sent + * @timeout: timeout in milliseconds to allow the message transmission + * to be completed + * @atomic: if set to true this function can be called from atomic + * context. + */ +int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message, + unsigned long timeout, bool atomic); + +#endif /* _LINUX_APPLE_RTKIT_H_ */ -- cgit v1.2.3 From 3254e0b9eb5649ffaa48717ebc9c593adc4ee6a9 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:31 +0300 Subject: ethtool: Add 10base-T1L link mode entry Add entry for the 10base-T1L full duplex mode. Reviewed-by: Andrew Lunn Reviewed-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 36ca2b5c2253..b12af9e2f389 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -65,7 +65,7 @@ extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; -extern const int phy_basic_t1_features_array[2]; +extern const int phy_basic_t1_features_array[3]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; -- cgit v1.2.3 From 3da8ffd8545f62fec85a48a3c637b2f427974f11 Mon Sep 17 00:00:00 2001 From: Alexandru Tachici Date: Fri, 29 Apr 2022 18:34:34 +0300 Subject: net: phy: Add 10BASE-T1L support in phy-c45 This patch is needed because the BASE-T1 uses different registers for status, control and advertisement to those already employed in the existing phy-c45 functions. Where required, genphy_c45 functions will now check whether the device supports BASE-T1 and use the specific registers instead: 45.2.7.19 BASE-T1 AN control register, 45.2.7.20 BASE-T1 AN status, 45.2.7.21 BASE-T1 AN advertisement register, 45.2.7.22 BASE-T1 AN LP Base Page ability register, 45.2.1.185 BASE-T1 PMA/PMD control register. Tested-by: Oleksij Rempel Signed-off-by: Alexandru Tachici Signed-off-by: David S. Miller --- include/linux/mdio.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ecac96d52e01..00177567cfef 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -340,6 +340,76 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising, advertising, lpa & MDIO_AN_10GBT_STAT_LP10G); } +/** + * mii_t1_adv_l_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [15:0] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [15:0] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_l_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_CAP); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, + lpa & MDIO_AN_T1_ADV_L_PAUSE_ASYM); +} + +/** + * mii_t1_adv_m_mod_linkmode_t + * @advertising: target the linkmode advertisement settings + * @lpa: value of the BASE-T1 Autonegotiation Advertisement [31:16] Register + * + * A small helper function that translates BASE-T1 Autonegotiation + * Advertisement [31:16] Register bits to linkmode advertisement settings. + * Other bits in advertising aren't changed. + */ +static inline void mii_t1_adv_m_mod_linkmode_t(unsigned long *advertising, u32 lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, + advertising, lpa & MDIO_AN_T1_ADV_M_B10L); +} + +/** + * linkmode_adv_to_mii_t1_adv_l_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [15:0] Register. + */ +static inline u32 linkmode_adv_to_mii_t1_adv_l_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_CAP; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) + result |= MDIO_AN_T1_ADV_L_PAUSE_ASYM; + + return result; +} + +/** + * linkmode_adv_to_mii_t1_adv_m_t + * @advertising: the linkmode advertisement settings + * + * A small helper function that translates linkmode advertisement + * settings to phy autonegotiation advertisements for the + * BASE-T1 Autonegotiation Advertisement [31:16] Register. + */ +static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising)) + result |= MDIO_AN_T1_ADV_M_B10L; + + return result; +} + int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, diff --git a/include/linux/phy.h b/include/linux/phy.h index b12af9e2f389..2d12054932ba 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -570,6 +570,7 @@ struct macsec_ops; * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover + * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics @@ -698,6 +699,8 @@ struct phy_device { u8 mdix; u8 mdix_ctrl; + int pma_extable; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); -- cgit v1.2.3 From 246d921646c071b878480997c294db6c83215b06 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 23 Nov 2021 13:37:52 -0500 Subject: fs-verity: define a function to return the integrity protected file digest Define a function named fsverity_get_digest() to return the verity file digest and the associated hash algorithm (enum hash_algo). This assumes that before calling fsverity_get_digest() the file must have been opened, which is even true for the IMA measure/appraise on file open policy rule use case (func=FILE_CHECK). do_open() calls vfs_open() immediately prior to ima_file_check(). Acked-by: Eric Biggers Signed-off-by: Mimi Zohar --- include/linux/fsverity.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index a7afc800bd8d..7af030fa3c36 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -12,8 +12,16 @@ #define _LINUX_FSVERITY_H #include +#include +#include #include +/* + * Largest digest size among all hash algorithms supported by fs-verity. + * Currently assumed to be <= size of fsverity_descriptor::root_hash. + */ +#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE + /* Verity operations for filesystems */ struct fsverity_operations { @@ -131,6 +139,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); +int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg); /* open.c */ @@ -170,6 +181,13 @@ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) return -EOPNOTSUPP; } +static inline int fsverity_get_digest(struct inode *inode, + u8 digest[FS_VERITY_MAX_DIGEST_SIZE], + enum hash_algo *alg) +{ + return -EOPNOTSUPP; +} + /* open.c */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) -- cgit v1.2.3 From 8cf9e1210adf49cd83d11e2b1984540cf0e0be71 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 28 Apr 2022 17:59:39 +0200 Subject: mm: slab: fix comment for ARCH_KMALLOC_MINALIGN The comment next to the ARCH_KMALLOC_MINALIGN definition says that ARCH_KMALLOC_MINALIGN can be defined in arch headers. This is incorrect: it's actually ARCH_DMA_MINALIGN that can be defined there. Fix the comment. Signed-off-by: Andrey Konovalov Acked-by: David Rientjes Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/fe1ca7a25a054b61d1038686d07569416e287e7b.1651161548.git.andreyknvl@google.com --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 11ceddcae9f4..8c090599fc21 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -197,7 +197,7 @@ void kmem_dump_obj(void *object); /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. - * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. + * Setting ARCH_DMA_MINALIGN in arch headers allows that. */ #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -- cgit v1.2.3 From 154036a3b3f39cb17fc65511011f4c4024846e91 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 28 Apr 2022 17:59:40 +0200 Subject: mm: slab: fix comment for __assume_kmalloc_alignment The comment next to the __assume_kmalloc_alignment definition is not precise: kmalloc relies on kmem_cache_alloc, so kmalloc technically returns pointers aligned to both ARCH_KMALLOC_MINALIGN and ARCH_SLAB_MINALIGN, not only to ARCH_KMALLOC_MINALIGN. (See create_kmalloc_cache()->create_boot_cache()->calculate_alignment() for SLAB and SLUB and __do_kmalloc_node() for SLOB.) Clarify the comment. The assumption specified by __assume_kmalloc_alignment is still correct, although it can be made stronger. I'll leave this to a separate patch. Signed-off-by: Andrey Konovalov Acked-by: David Rientjes Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/84d8142747230f2015eaf9705ee7c2e1a9f56596.1651161548.git.andreyknvl@google.com --- include/linux/slab.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 8c090599fc21..58bb9392775d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -217,9 +217,9 @@ void kmem_dump_obj(void *object); #endif /* - * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned - * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN - * aligned pointers. + * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN. + * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN + * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment. */ #define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) #define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) -- cgit v1.2.3 From 23587f7c5daa936524ede26201253a089666b5d9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 29 Apr 2022 17:05:45 +0800 Subject: mm/slub: remove unused kmem_cache_order_objects max max field holds the largest slab order that was ever used for a slab cache. But it's unused now. Remove it. Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Acked-by: David Rientjes Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20220429090545.33413-1-linmiaohe@huawei.com --- include/linux/slub_def.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33c5c0e3bd8d..f9c68a9dac04 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -105,7 +105,6 @@ struct kmem_cache { struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects max; struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ -- cgit v1.2.3 From 94f697c5384bd7f9632acca483ba1ef9dd99ea97 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 29 Apr 2022 12:01:53 +0200 Subject: mtd: spi-nor: move spi_nor_write_ear() to winbond module The "Extended Address Register" is winbond specific. If the flash is larger than 16MiB and is used in 3 byte address mode, it is used to set the remaining address bits. Move the write_ear() function, the opcode macros and the spimem op template into the winbond module and rename them accordingly. Signed-off-by: Michael Walle Signed-off-by: Pratyush Yadav Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20220429100153.2338501-1-michael@walle.cc --- include/linux/mtd/spi-nor.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 5e25a7b75ae2..e505c4a5c530 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -47,8 +47,6 @@ #define SPINOR_OP_RDID 0x9f /* Read JEDEC ID */ #define SPINOR_OP_RDSFDP 0x5a /* Read SFDP */ #define SPINOR_OP_RDCR 0x35 /* Read configuration register */ -#define SPINOR_OP_RDEAR 0xc8 /* Read Extended Address Register */ -#define SPINOR_OP_WREAR 0xc5 /* Write Extended Address Register */ #define SPINOR_OP_SRSTEN 0x66 /* Software Reset Enable */ #define SPINOR_OP_SRST 0x99 /* Software Reset */ #define SPINOR_OP_GBULK 0x98 /* Global Block Unlock */ -- cgit v1.2.3 From b170143ae1113882731666aec9b9105356f1fc17 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Sun, 1 May 2022 16:55:08 +0200 Subject: soc: apple: Add SART driver The NVMe co-processor on the Apple M1 uses a DMA address filter called SART for some DMA transactions. This adds a simple driver used to configure the memory regions from which DMA transactions are allowed. Unlike a real IOMMU, SART does not support any pagetables and can't be implemented inside the IOMMU subsystem using iommu_ops. It also can't be implemented using dma_map_ops since not all DMA transactions of the NVMe controller are filtered by SART. Instead, most buffers have to be registered using the integrated NVMe IOMMU and we can't have two separate dma_map_ops implementations for a single device. Co-developed-by: Hector Martin Reviewed-by: Arnd Bergmann Signed-off-by: Hector Martin Signed-off-by: Sven Peter --- include/linux/soc/apple/sart.h | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 include/linux/soc/apple/sart.h (limited to 'include/linux') diff --git a/include/linux/soc/apple/sart.h b/include/linux/soc/apple/sart.h new file mode 100644 index 000000000000..2249bf6cde09 --- /dev/null +++ b/include/linux/soc/apple/sart.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* + * Apple SART device driver + * Copyright (C) The Asahi Linux Contributors + * + * Apple SART is a simple address filter for DMA transactions. + * Regions of physical memory must be added to the SART's allow + * list before any DMA can target these. Unlike a proper + * IOMMU no remapping can be done. + */ + +#ifndef _LINUX_SOC_APPLE_SART_H_ +#define _LINUX_SOC_APPLE_SART_H_ + +#include +#include +#include + +struct apple_sart; + +/* + * Get a reference to the SART attached to dev. + * + * Looks for the phandle reference in apple,sart and returns a pointer + * to the corresponding apple_sart struct to be used with + * apple_sart_add_allowed_region and apple_sart_remove_allowed_region. + */ +struct apple_sart *devm_apple_sart_get(struct device *dev); + +/* + * Adds the region [paddr, paddr+size] to the DMA allow list. + * + * @sart: SART reference + * @paddr: Start address of the region to be used for DMA + * @size: Size of the region to be used for DMA. + */ +int apple_sart_add_allowed_region(struct apple_sart *sart, phys_addr_t paddr, + size_t size); + +/* + * Removes the region [paddr, paddr+size] from the DMA allow list. + * + * Note that exact same paddr and size used for apple_sart_add_allowed_region + * have to be passed. + * + * @sart: SART reference + * @paddr: Start address of the region no longer used for DMA + * @size: Size of the region no longer used for DMA. + */ +int apple_sart_remove_allowed_region(struct apple_sart *sart, phys_addr_t paddr, + size_t size); + +#endif /* _LINUX_SOC_APPLE_SART_H_ */ -- cgit v1.2.3 From f502cc568de95e5ed9cc9e6133fa454fbe0c5c01 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Mar 2022 11:48:38 -0800 Subject: KVM: Add max_vcpus field in common 'struct kvm' For TDX guests, the maximum number of vcpus needs to be specified when the TDX guest VM is initialized (creating the TDX data corresponding to TDX guest) before creating vcpu. It needs to record the maximum number of vcpus on VM creation (KVM_CREATE_VM) and return error if the number of vcpus exceeds it Because there is already max_vcpu member in arm64 struct kvm_arch, move it to common struct kvm and initialize it to KVM_MAX_VCPUS before kvm_arch_init_vm() instead of adding it to x86 struct kvm_arch. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Message-Id: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 252ee4a61b58..f94f72bbd2d3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -725,6 +725,7 @@ struct kvm { * and is accessed atomically. */ atomic_t online_vcpus; + int max_vcpus; int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; -- cgit v1.2.3 From db05628435aa761d30b4eae481a82befe7a8492a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:12 +0200 Subject: blk-cgroup: move blkcg_{get,set}_fc_appid out of line No need to have these helpers inline. Also remove the stubs and just use an IS_ENABLED for the get side (the set side already is only built conditionlly). Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 58 ++-------------------------------------------- 1 file changed, 2 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 652cd05b0924..7a2f7de30173 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -218,61 +218,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } #endif /* CONFIG_BLK_CGROUP */ -#ifdef CONFIG_BLK_CGROUP_FC_APPID -/* - * Sets the fc_app_id field associted to blkcg - * @app_id: application identifier - * @cgrp_id: cgroup id - * @app_id_len: size of application identifier - */ -static inline int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) -{ - struct cgroup *cgrp; - struct cgroup_subsys_state *css; - struct blkcg *blkcg; - int ret = 0; - - if (app_id_len > FC_APPID_LEN) - return -EINVAL; - - cgrp = cgroup_get_from_id(cgrp_id); - if (!cgrp) - return -ENOENT; - css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); - if (!css) { - ret = -ENOENT; - goto out_cgrp_put; - } - blkcg = css_to_blkcg(css); - /* - * There is a slight race condition on setting the appid. - * Worst case an I/O may not find the right id. - * This is no different from the I/O we let pass while obtaining - * the vmid from the fabric. - * Adding the overhead of a lock is not necessary. - */ - strlcpy(blkcg->fc_app_id, app_id, app_id_len); - css_put(css); -out_cgrp_put: - cgroup_put(cgrp); - return ret; -} +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len); +char *blkcg_get_fc_appid(struct bio *bio); -/** - * blkcg_get_fc_appid - get the fc app identifier associated with a bio - * @bio: target bio - * - * On success return the fc_app_id, on failure return NULL - */ -static inline char *blkcg_get_fc_appid(struct bio *bio) -{ - if (bio && bio->bi_blkg && - (bio->bi_blkg->blkcg->fc_app_id[0] != '\0')) - return bio->bi_blkg->blkcg->fc_app_id; - return NULL; -} -#else -static inline int blkcg_set_fc_appid(char *buf, u64 id, size_t len) { return -EINVAL; } -static inline char *blkcg_get_fc_appid(struct bio *bio) { return NULL; } -#endif /*CONFIG_BLK_CGROUP_FC_APPID*/ #endif /* _BLK_CGROUP_H */ -- cgit v1.2.3 From 216889aad362b5b7e998a5371348b5e95d485dd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:13 +0200 Subject: blk-cgroup: move blk_cgroup_congested out line There is no urgent need to inline this function, so move it out of line. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7a2f7de30173..988965c1c27b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -135,25 +135,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return NULL; } -static inline bool blk_cgroup_congested(void) -{ - struct cgroup_subsys_state *css; - bool ret = false; - - rcu_read_lock(); - css = kthread_blkcg(); - if (!css) - css = task_css(current, io_cgrp_id); - while (css) { - if (atomic_read(&css->cgroup->congestion_count)) { - ret = true; - break; - } - css = css->parent; - } - rcu_read_unlock(); - return ret; -} +bool blk_cgroup_congested(void); /** * blkcg_parent - get the parent of a blkcg -- cgit v1.2.3 From 397c9f46ee4d99024c64954b007c1b5762d01cb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:14 +0200 Subject: blk-cgroup: move blkcg_{pin,unpin}_online out of line Move these two functions out of line as there is no good reason to inline them. Also switch to passing a cgroup_subsys_state instead of doing the conversion in the caller to prepare for making the blkcg structure private to blk-cgroup. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 46 ++-------------------------------------------- 1 file changed, 2 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 988965c1c27b..0fb7459096e9 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -111,7 +111,6 @@ struct blkcg_gq { extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_destroy_blkgs(struct blkcg *blkcg); void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); void blkcg_maybe_throttle_current(void); @@ -136,49 +135,8 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) } bool blk_cgroup_congested(void); - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkcg_pin_online - pin online state - * @blkcg: blkcg of interest - * - * While pinned, a blkcg is kept online. This is primarily used to - * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline - * while an associated cgwb is still active. - */ -static inline void blkcg_pin_online(struct blkcg *blkcg) -{ - refcount_inc(&blkcg->online_pin); -} - -/** - * blkcg_unpin_online - unpin online state - * @blkcg: blkcg of interest - * - * This is primarily used to impedance-match blkg and cgwb lifetimes so - * that blkg doesn't go offline while an associated cgwb is still active. - * When this count goes to zero, all active cgwbs have finished so the - * blkcg can continue destruction by calling blkcg_destroy_blkgs(). - */ -static inline void blkcg_unpin_online(struct blkcg *blkcg) -{ - do { - if (!refcount_dec_and_test(&blkcg->online_pin)) - break; - blkcg_destroy_blkgs(blkcg); - blkcg = blkcg_parent(blkcg); - } while (blkcg); -} +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css); #else /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From dec223c92a4688f6c9642d640cfe15a99d289dd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:15 +0200 Subject: blk-cgroup: move struct blkcg to block/blk-cgroup.h There is no real need to expose the blkcg structure to the whole kernel. Move it to the private header an expose a helper to let the writeback code access the cgwb_list member. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 6 ++---- include/linux/blk-cgroup.h | 32 +------------------------------- 2 files changed, 3 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 87ce24d238f3..2bd073fa6bb5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,8 +17,6 @@ #include #include -struct blkcg; - static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -154,7 +152,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); -void wb_blkcg_offline(struct blkcg *blkcg); +void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -378,7 +376,7 @@ static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } -static inline void wb_blkcg_offline(struct blkcg *blkcg) +static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0fb7459096e9..d7b188095040 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -37,29 +37,6 @@ enum blkg_iostat_type { BLKG_IOSTAT_NR, }; -struct blkcg_gq; -struct blkg_policy_data; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - refcount_t online_pin; - - struct radix_tree_root blkg_tree; - struct blkcg_gq __rcu *blkg_hint; - struct hlist_head blkg_list; - - struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; - - struct list_head all_blkcgs_node; -#ifdef CONFIG_BLK_CGROUP_FC_APPID - char fc_app_id[FC_APPID_LEN]; -#endif -#ifdef CONFIG_CGROUP_WRITEBACK - struct list_head cgwb_list; -#endif -}; - struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; @@ -114,11 +91,6 @@ extern struct cgroup_subsys_state * const blkcg_root_css; void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); void blkcg_maybe_throttle_current(void); -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - /** * bio_blkcg - grab the blkcg associated with a bio * @bio: target bio @@ -137,12 +109,10 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css); +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css); #else /* CONFIG_BLK_CGROUP */ -struct blkcg { -}; - struct blkcg_gq { }; -- cgit v1.2.3 From f4a6a61cb6d40d9ae63e47743d33200f3efe3fe7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:16 +0200 Subject: blktrace: cleanup the __trace_note_message interface Pass the cgroup_subsys_state instead of a the blkg so that blktrace doesn't need to poke into blk-cgroup internals, and give the name a blk prefix as the current name is way too generic for a public interface. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-9-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 22501a293fa5..623e22492afa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -27,12 +27,10 @@ struct blk_trace { atomic_t dropped; }; -struct blkcg; - extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(3, 4) -void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); +__printf(3, 4) void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -47,14 +45,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, css, fmt, ...) \ do { \ struct blk_trace *bt; \ \ rcu_read_lock(); \ bt = rcu_dereference((q)->blk_trace); \ if (unlikely(bt)) \ - __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ + __blk_trace_note_message(bt, css, fmt, ##__VA_ARGS__);\ rcu_read_unlock(); \ } while (0) #define blk_add_trace_msg(q, fmt, ...) \ -- cgit v1.2.3 From bbb1ebe7a909db4de49777fb7676d5bf293f34c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:17 +0200 Subject: blk-cgroup: replace bio_blkcg with bio_blkcg_css All callers of bio_blkcg actually want the CSS, so replace it with an interface that does return the CSS. This now allows to move struct blkcg_gq to block/blk-cgroup.h instead of exposing it in a public header. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 83 +++------------------------------------------- 1 file changed, 5 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d7b188095040..97c7968e3204 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -28,94 +28,18 @@ #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP - -enum blkg_iostat_type { - BLKG_IOSTAT_READ, - BLKG_IOSTAT_WRITE, - BLKG_IOSTAT_DISCARD, - - BLKG_IOSTAT_NR, -}; - -struct blkg_iostat { - u64 bytes[BLKG_IOSTAT_NR]; - u64 ios[BLKG_IOSTAT_NR]; -}; - -struct blkg_iostat_set { - struct u64_stats_sync sync; - struct blkg_iostat cur; - struct blkg_iostat last; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* reference count */ - struct percpu_ref refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_iostat_set __percpu *iostat_cpu; - struct blkg_iostat_set iostat; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - spinlock_t async_bio_lock; - struct bio_list async_bios; - union { - struct work_struct async_bio_work; - struct work_struct free_work; - }; - - atomic_t use_delay; - atomic64_t delay_nsec; - atomic64_t delay_start; - u64 last_delay; - int last_use; - - struct rcu_head rcu_head; -}; - extern struct cgroup_subsys_state * const blkcg_root_css; void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); void blkcg_maybe_throttle_current(void); - -/** - * bio_blkcg - grab the blkcg associated with a bio - * @bio: target bio - * - * This returns the blkcg associated with a bio, %NULL if not associated. - * Callers are expected to either handle %NULL or know association has been - * done prior to calling this. - */ -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return NULL; -} - bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css); struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ -struct blkcg_gq { -}; - #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) static inline void blkcg_maybe_throttle_current(void) { } @@ -123,7 +47,10 @@ static inline bool blk_cgroup_congested(void) { return false; } #ifdef CONFIG_BLOCK static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } +static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + return NULL; +} #endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From 7f20ba7c42fd899557cef7d001f48711c3066ba5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:18 +0200 Subject: blk-cgroup: remove pointless CONFIG_BLOCK ifdefs No need to make BLK_CGROUP stubs conditional on CONFIG_BLOCK as they can't be used without that. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 97c7968e3204..abbfa97d6d46 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -44,15 +44,11 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } - -#ifdef CONFIG_BLOCK static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; } -#endif /* CONFIG_BLOCK */ - #endif /* CONFIG_BLK_CGROUP */ int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len); -- cgit v1.2.3 From c97ab271576dec2170e7b804cb05f7617b30fed9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:19 +0200 Subject: blk-cgroup: remove unneeded includes from Remove all the includes that aren't actually needed from and push them to the actual source files where needed. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index abbfa97d6d46..9f40dbc65f82 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,16 +14,11 @@ * Nauman Rafique */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +struct bio; +struct cgroup_subsys_state; +struct request_queue; #define FC_APPID_LEN 129 -- cgit v1.2.3 From f624506f98b198e65b44da303f44974590fb16c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Apr 2022 06:27:23 +0200 Subject: kthread: unexport kthread_blkcg kthread_blkcg is only used by the built-in blk-cgroup code. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220420042723.1010598-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/kthread.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index de5d75bafd66..30e5bec81d2b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -222,9 +222,5 @@ void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } -static inline struct cgroup_subsys_state *kthread_blkcg(void) -{ - return NULL; -} #endif #endif /* _LINUX_KTHREAD_H */ -- cgit v1.2.3 From d639af621600dc52dd9ed0dcf62b22595f2f5b52 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 22 Mar 2022 09:16:39 +0000 Subject: net/mlx5: fs, split software and IFC flow destination definitions Separate flow destinations between software and IFC. Flow destination type passed by callers was used as the input in firmware commands and over the years software only types were added which resulted in mixing between the two. Create an IFC enum that contains only the flow destinations defined when talking to the firmware. Now that there is a proper software only enum for flow destinations the hardcoded values can be removed as the values are no longer used in firmware commands. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 11 +++++++++++ include/linux/mlx5/mlx5_ifc.h | 16 ++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e3bfed68b08a..9da9df9ae751 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,6 +40,17 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, + MLX5_FLOW_DESTINATION_TYPE_TIR, + MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER, + MLX5_FLOW_DESTINATION_TYPE_UPLINK, + MLX5_FLOW_DESTINATION_TYPE_PORT, + MLX5_FLOW_DESTINATION_TYPE_COUNTER, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, +}; + enum { MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7d2d0ba82144..7f4ec9faa180 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1806,16 +1806,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_c0[0x740]; }; -enum mlx5_flow_destination_type { - MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, - MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, - MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, - MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, - MLX5_FLOW_DESTINATION_TYPE_UPLINK = 0x8, - - MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, - MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, - MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM = 0x101, +enum mlx5_ifc_flow_destination_type { + MLX5_IFC_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, + MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, }; enum mlx5_flow_table_miss_action { -- cgit v1.2.3 From 6510bc0d7cb4119ebe10f9ab43aca6fd0e5f3e73 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 15 Mar 2022 14:08:23 +0000 Subject: net/mlx5: fs, add unused destination type When the caller doesn't pass a destination fs_core will create a unused rule just so a context can be returned. This unused rule is zeroed out and its type is 0 which can be mixed up with MLX5_FLOW_DESTINATION_TYPE_VPORT. Create a dedicated type to differentiate between the two named MLX5_FLOW_DESTINATION_TYPE_NONE. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9da9df9ae751..8135713b0d2d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -41,6 +41,7 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_NONE, MLX5_FLOW_DESTINATION_TYPE_VPORT, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, MLX5_FLOW_DESTINATION_TYPE_TIR, -- cgit v1.2.3 From c9bc1a0ef9f613a7bc1adfff4c67dc5e5d7d1709 Mon Sep 17 00:00:00 2001 From: "Dustin L. Howett" Date: Thu, 17 Feb 2022 10:59:30 -0600 Subject: platform/chrome: cros_ec_lpcs: reserve the MEC LPC I/O ports first Some ChromeOS EC devices (such as the Framework Laptop) only map I/O ports 0x800-0x807. Making the larger reservation required by the non-MEC LPC (the 0xFF ports for the memory map, and the 0xFF ports for the parameter region) is non-viable on these devices. Since we probe the MEC EC first, we can get away with a smaller reservation that covers the MEC EC ports. If we fall back to classic LPC, we can grow the reservation to cover the memory map and the parameter region. cros_ec_lpc_probe also interacted with I/O ports 0x800-0x807 without a reservation. Restructuring the code to request the MEC LPC region first obviates the need to do so. Signed-off-by: Dustin L. Howett Signed-off-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20220217165930.15081-3-dustin@howett.net --- include/linux/platform_data/cros_ec_commands.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index c23554531961..8cfa8cfca77e 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -51,10 +51,14 @@ /* * The actual block is 0x800-0x8ff, but some BIOSes think it's 0x880-0x8ff * and they tell the kernel that so we have to think of it as two parts. + * + * Other BIOSes report only the I/O port region spanned by the Microchip + * MEC series EC; an attempt to address a larger region may fail. */ -#define EC_HOST_CMD_REGION0 0x800 -#define EC_HOST_CMD_REGION1 0x880 -#define EC_HOST_CMD_REGION_SIZE 0x80 +#define EC_HOST_CMD_REGION0 0x800 +#define EC_HOST_CMD_REGION1 0x880 +#define EC_HOST_CMD_REGION_SIZE 0x80 +#define EC_HOST_CMD_MEC_REGION_SIZE 0x8 /* EC command register bit functions */ #define EC_LPC_CMDR_DATA BIT(0) /* Data ready for host to read */ -- cgit v1.2.3 From 4c7f24f857c7cd0381dd92495db476066d1c6aec Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sun, 1 May 2022 11:55:23 +0800 Subject: net: sysctl: introduce sysctl SYSCTL_THREE This patch introdues the SYSCTL_THREE. KUnit: [00:10:14] ================ sysctl_test (10 subtests) ================= [00:10:14] [PASSED] sysctl_test_api_dointvec_null_tbl_data [00:10:14] [PASSED] sysctl_test_api_dointvec_table_maxlen_unset [00:10:14] [PASSED] sysctl_test_api_dointvec_table_len_is_zero [00:10:14] [PASSED] sysctl_test_api_dointvec_table_read_but_position_set [00:10:14] [PASSED] sysctl_test_dointvec_read_happy_single_positive [00:10:14] [PASSED] sysctl_test_dointvec_read_happy_single_negative [00:10:14] [PASSED] sysctl_test_dointvec_write_happy_single_positive [00:10:14] [PASSED] sysctl_test_dointvec_write_happy_single_negative [00:10:14] [PASSED] sysctl_test_api_dointvec_write_single_less_int_min [00:10:14] [PASSED] sysctl_test_api_dointvec_write_single_greater_int_max [00:10:14] =================== [PASSED] sysctl_test =================== ./run_kselftest.sh -c sysctl ... ok 1 selftests: sysctl: sysctl.sh Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Hideaki YOSHIFUJI Cc: David Ahern Cc: Simon Horman Cc: Julian Anastasov Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: Shuah Khan Cc: Andrew Morton Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Lorenz Bauer Cc: Akhmat Karakotov Signed-off-by: Tonghao Zhang Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/sysctl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6353d6db69b2..80263f7cdb77 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,10 +38,10 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) -#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) -#define SYSCTL_ONE ((void *)&sysctl_vals[2]) -#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) +#define SYSCTL_ONE ((void *)&sysctl_vals[1]) +#define SYSCTL_TWO ((void *)&sysctl_vals[2]) +#define SYSCTL_THREE ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) #define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) #define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) @@ -51,6 +51,7 @@ struct ctl_dir; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) extern const int sysctl_vals[]; -- cgit v1.2.3 From 62139f52b7e588d565aa9df81ea0a0548a68b823 Mon Sep 17 00:00:00 2001 From: Per-Daniel Olsson Date: Fri, 29 Apr 2022 09:22:08 +0200 Subject: regulator: pca9450: Make I2C Level Translator configurable Make the I2C Level Translator included in PCA9450 configurable from devicetree. The reset state is off. By setting nxp,i2c-lt-enable, the I2C Level Translator will be enabled while in STANDBY or RUN state. Signed-off-by: Per-Daniel Olsson Signed-off-by: Rickard x Andersson Link: https://lore.kernel.org/r/20220429072211.24957-2-rickaran@axis.com Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 71902f41c919..3c01c2bf84f5 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -226,4 +226,11 @@ enum { #define WDOG_B_CFG_COLD_LDO12 0x80 #define WDOG_B_CFG_COLD 0xC0 +/* PCA9450_REG_CONFIG2 bits */ +#define I2C_LT_MASK 0x03 +#define I2C_LT_FORCE_DISABLE 0x00 +#define I2C_LT_ON_STANDBY_RUN 0x01 +#define I2C_LT_ON_RUN 0x02 +#define I2C_LT_FORCE_ENABLE 0x03 + #endif /* __LINUX_REG_PCA9450_H__ */ -- cgit v1.2.3 From 1dcbae86ee669bdb0338954cd0136863f5c96c0a Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Thu, 14 Apr 2022 12:27:24 -0700 Subject: soc: ti: wkup_m3_ipc: Add support for IO Isolation AM43xx support isolation of the IOs so that control is taken from the peripheral they are connected to and overridden by values present in the CTRL_CONF_* registers for the pad in the control module. The actual toggling happens from the wkup_m3, so use a DT property from the wkup_m3_ipc node to allow the PM code to communicate the necessity for placing the IOs into isolation to the firmware. Signed-off-by: Dave Gerlach Signed-off-by: Keerthy Signed-off-by: Drew Fustini Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20220414192722.2978837-3-dfustini@baylibre.com --- include/linux/wkup_m3_ipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index 2bc52c6381d5..b706eac58f92 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -34,6 +34,7 @@ struct wkup_m3_ipc { int mem_type; unsigned long resume_addr; int vtt_conf; + int isolation_conf; int state; struct completion sync_complete; -- cgit v1.2.3 From ea082040fe071d2ba1f8f73792743d7ca9fb218e Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 26 Apr 2022 13:07:44 -0700 Subject: soc: ti: wkup_m3_ipc: Add support for i2c voltage scaling Allow loading of a binary containing i2c scaling sequences to be provided to the wkup_m3 firmware in order to properly scale voltage rails on the PMIC during low power modes like DeepSleep0. Proper binary format is determined by the FW in use. Code expects firmware to have 0x0C57 present as the first two bytes followed by one byte defining offset to sleep sequence followed by one byte defining offset to wake sequence and then lastly both sequences. Each sequence is a series of I2C transfers in the form: u8 length | u8 chip address | u8 byte0/reg address | u8 byte1 | u8 byteN .. The length indicates the number of bytes to transfer, including the register address. The length of each transfer is limited by the I2C buffer size of 32 bytes. Based on previous work by Russ Dill. [dfustini: replace FW_ACTION_HOTPLUG with FW_ACTION_UEVENT] Signed-off-by: Dave Gerlach Signed-off-by: Keerthy [dfustini: add NULL argument to rproc_da_to_va() call] Signed-off-by: Drew Fustini Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20220426200741.712842-3-dfustini@baylibre.com --- include/linux/wkup_m3_ipc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index b706eac58f92..fef0fac60f8c 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -37,6 +37,9 @@ struct wkup_m3_ipc { int isolation_conf; int state; + unsigned long volt_scale_offsets; + const char *sd_fw_name; + struct completion sync_complete; struct mbox_client mbox_client; struct mbox_chan *mbox; @@ -50,6 +53,12 @@ struct wkup_m3_wakeup_src { char src[10]; }; +struct wkup_m3_scale_data_header { + u16 magic; + u8 sleep_offset; + u8 wake_offset; +} __packed; + struct wkup_m3_ipc_ops { void (*set_mem_type)(struct wkup_m3_ipc *m3_ipc, int mem_type); void (*set_resume_address)(struct wkup_m3_ipc *m3_ipc, void *addr); -- cgit v1.2.3 From 2a21f9e6d9a408dbd09a01caf5fff42c2f70fa82 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Sun, 1 May 2022 20:32:12 -0700 Subject: soc: ti: wkup_m3_ipc: Add debug option to halt m3 in suspend Add a debugfs option to allow configurable halting of the wkup_m3 during suspend at the last possible point before low power mode entry. This condition can only be resolved through JTAG and advancing beyond the while loop in a8_lp_ds0_handler [1]. Although this hangs the system it forces the system to remain active once it has been entirely configured for low power mode entry, allowing for register inspection through JTAG to help in debugging transition errors. Halt mode can be set using the enable_off_mode entry under wkup_m3_ipc in the debugfs. [1] https://git.ti.com/cgit/processor-firmware/ti-amx3-cm3-pm-firmware/tree/src/pm_services/pm_handlers.c?h=08.02.00.006#n141 Suggested-by: Brad Griffis Signed-off-by: Dave Gerlach [dfustini: add link for a8_lp_ds0_handler() in ti-amx3-cm3-pm-firmware] Signed-off-by: Drew Fustini Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/20220502033211.1383158-1-dfustini@baylibre.com --- include/linux/wkup_m3_ipc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index fef0fac60f8c..26d1eb058fa3 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -36,6 +36,7 @@ struct wkup_m3_ipc { int vtt_conf; int isolation_conf; int state; + u32 halt; unsigned long volt_scale_offsets; const char *sd_fw_name; @@ -46,6 +47,7 @@ struct wkup_m3_ipc { struct wkup_m3_ipc_ops *ops; int is_rtc_only; + struct dentry *dbg_path; }; struct wkup_m3_wakeup_src { -- cgit v1.2.3 From 3ba75c1316390b2bc39c19cb8f0f85922ab3f9ed Mon Sep 17 00:00:00 2001 From: Baskov Evgeniy Date: Thu, 3 Mar 2022 17:21:19 +0300 Subject: efi: libstub: declare DXE services table UEFI DXE services are not yet used in kernel code but are required to manipulate page table memory protection flags. Add required declarations to use DXE services functions. Signed-off-by: Baskov Evgeniy Link: https://lore.kernel.org/r/20220303142120.1975-2-baskov@ispras.ru [ardb: ignore absent DXE table but warn if the signature check fails] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index fd266198f9d8..b1f7c6a3e5bf 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -385,6 +385,7 @@ void efi_native_runtime_setup(void); #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d) #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) +#define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9) #define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f) #define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23) @@ -438,6 +439,7 @@ typedef struct { } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL) #define EFI_2_30_SYSTEM_TABLE_REVISION ((2 << 16) | (30)) #define EFI_2_20_SYSTEM_TABLE_REVISION ((2 << 16) | (20)) -- cgit v1.2.3 From 07768c55f9c2ad64ccae3ed82447a87d8af8a687 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 19 Mar 2022 19:00:20 +0100 Subject: efi/arm64: libstub: run image in place if randomized by the loader If the loader has already placed the EFI kernel image randomly in physical memory, and indicates having done so by installing the 'fixed placement' protocol onto the image handle, don't bother randomizing the placement again in the EFI stub. Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index b1f7c6a3e5bf..580ce607d6f5 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,17 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +/* + * This GUID may be installed onto the kernel image's handle as a NULL protocol + * to signal to the stub that the placement of the image should be respected, + * and moving the image in physical memory is undesirable. To ensure + * compatibility with 64k pages kernels with virtually mapped stacks, and to + * avoid defeating physical randomization, this protocol should only be + * installed if the image was placed at a randomized 128k aligned address in + * memory. + */ +#define LINUX_EFI_LOADED_IMAGE_FIXED_GUID EFI_GUID(0xf5a37b6d, 0x3344, 0x42a5, 0xb6, 0xbb, 0x97, 0x86, 0x48, 0xc1, 0x89, 0x0a) + /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) #define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) -- cgit v1.2.3 From f2b6e79c7378542e03e42fd47c0c8bf00a2fcf6b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 May 2022 16:44:02 +0200 Subject: Revert "usb: core: hcd: Create platform devices for onboard hubs in probe()" This reverts commit c40b62216c1aecc0dc00faf33d71bd71cb440337. The series still has built errors as reported in linux-next, so revert it for now. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20220502210728.0b36f3cd@canb.auug.org.au Cc: Stephen Boyd Cc: Douglas Anderson Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 4ebc91c09182..548a028f2dab 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -198,7 +198,6 @@ struct usb_hcd { struct usb_hcd *shared_hcd; struct usb_hcd *primary_hcd; - struct list_head onboard_hub_devs; #define HCD_BUFFER_POOLS 4 struct dma_pool *pool[HCD_BUFFER_POOLS]; -- cgit v1.2.3 From 67a7570ad31f2a7f02550153e774bf4c630f0fef Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 May 2022 16:44:11 +0200 Subject: Revert "usb: misc: Add onboard_usb_hub driver" This reverts commit 0298b4b95cb373c21e6323c905589f8dac42c5b4. The series still has built errors as reported in linux-next, so revert it for now. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20220502210728.0b36f3cd@canb.auug.org.au Cc: Alan Stern Cc: Douglas Anderson Cc: Matthias Kaehlcke Cc: Ravi Chandra Sadineni Cc: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/onboard_hub.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/usb/onboard_hub.h (limited to 'include/linux') diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h deleted file mode 100644 index d9373230556e..000000000000 --- a/include/linux/usb/onboard_hub.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __LINUX_USB_ONBOARD_HUB_H -#define __LINUX_USB_ONBOARD_HUB_H - -struct usb_device; -struct list_head; - -#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB) -void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list); -void onboard_hub_destroy_pdevs(struct list_head *pdev_list); -#else -static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub, - struct list_head *pdev_list) {} -static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {} -#endif - -#endif /* __LINUX_USB_ONBOARD_HUB_H */ -- cgit v1.2.3 From 1a9517a0a43040b77b5ef64d9053df214ba33d17 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 3 May 2022 16:44:24 +0200 Subject: Revert "of/platform: Add stubs for of_platform_device_create/destroy()" This reverts commit 8e8b11956486e3fe8cacf54a1d492ebdd8cc1fb2. The series still has built errors as reported in linux-next, so revert it for now. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20220502210728.0b36f3cd@canb.auug.org.au Cc: Stephen Boyd Cc: Douglas Anderson Cc: Rob Herring Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- include/linux/of_platform.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index d15b6cd5e1c3..84a966623e78 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -61,18 +61,16 @@ static inline struct platform_device *of_find_device_by_node(struct device_node } #endif -extern int of_platform_bus_probe(struct device_node *root, - const struct of_device_id *matches, - struct device *parent); - -#ifdef CONFIG_OF_ADDRESS /* Platform devices and busses creation */ extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent); extern int of_platform_device_destroy(struct device *dev, void *data); - +extern int of_platform_bus_probe(struct device_node *root, + const struct of_device_id *matches, + struct device *parent); +#ifdef CONFIG_OF_ADDRESS extern int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, @@ -86,18 +84,6 @@ extern int devm_of_platform_populate(struct device *dev); extern void devm_of_platform_depopulate(struct device *dev); #else -/* Platform devices and busses creation */ -static inline struct platform_device *of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - return NULL; -} -static inline int of_platform_device_destroy(struct device *dev, void *data) -{ - return -ENODEV; -} - static inline int of_platform_populate(struct device_node *root, const struct of_device_id *matches, const struct of_dev_auxdata *lookup, -- cgit v1.2.3 From 1ac17586c950a2c129393f8a92901a2b357acf24 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 2 May 2022 13:17:40 -0500 Subject: of: overlay: add entry to of_overlay_action_name[] The values of enum of_overlay_notify_action are used to index into array of_overlay_action_name. Add an entry to of_overlay_action_name for the value recently added to of_overlay_notify_action. Array of_overlay_action_name[] is moved into include/linux/of.h adjacent to enum of_overlay_notify_action to make the connection between the two more obvious if either is modified in the future. The only use of of_overlay_action_name is for error reporting in overlay_notify(). All callers of overlay_notify() report the same error, but with fewer details. Remove the redundant error reports in the callers. Fixes: 067c098766c6 ("of: overlay: rework overlay apply and remove kfree()s") Reported-by: Dan Carpenter Signed-off-by: Frank Rowand Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220502181742.1402826-2-frowand.list@gmail.com --- include/linux/of.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 17741eee0ca4..f0a5d6b10c5a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1550,6 +1550,19 @@ enum of_overlay_notify_action { OF_OVERLAY_POST_REMOVE, }; +static inline char *of_overlay_action_name(enum of_overlay_notify_action action) +{ + static char *of_overlay_action_name[] = { + "init", + "pre-apply", + "post-apply", + "pre-remove", + "post-remove", + }; + + return of_overlay_action_name[action]; +} + struct of_overlay_notify_data { struct device_node *overlay; struct device_node *target; -- cgit v1.2.3 From 282d8998e9979c2186af7f7d22366f2fc3149838 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Mar 2022 15:45:33 -0800 Subject: srcu: Prevent expedited GPs and blocking readers from consuming CPU If an SRCU reader blocks while a synchronize_srcu_expedited() waits for that same reader, then that grace period will spawn an endless series of workqueue handlers, consuming a full CPU. This quickly gets pointless because consuming more CPU isn't going to make that reader get done faster, especially if it is blocked waiting for an external event. This commit therefore spawns at most one pair of back-to-back workqueue handlers per expedited grace period phase, instead inserting increasing delays as that grace period phase grows older, but capped at 10 jiffies. In any case, if there have been at least 100 back-to-back workqueue handlers within a single jiffy, regardless of grace period or grace-period phase, then a one-jiffy delay is inserted. [ paulmck: Apply feedback from kernel test robot. ] Cc: Neeraj Upadhyay Reported-by: Song Liu Tested-by: kernel test robot Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b9ff4ed37e4..e3014319d1ad 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -71,9 +71,11 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ + unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ @@ -83,6 +85,8 @@ struct srcu_struct { atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ + unsigned long reschedule_jiffies; + unsigned long reschedule_count; struct delayed_work work; struct lockdep_map dep_map; }; -- cgit v1.2.3 From c29722fad4aabbf6bb841b8f058f858ec911df56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Tue, 8 Mar 2022 18:09:26 +0100 Subject: selinux: log anon inode class name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log the anonymous inode class name in the security hook inode_init_security_anon. This name is the key for name based type transitions on the anon_inode security class on creation. Example: type=AVC msg=audit(02/16/22 22:02:50.585:216) : avc: granted \ { create } for pid=2136 comm=mariadbd anonclass=[io_uring] \ scontext=system_u:system_r:mysqld_t:s0 \ tcontext=system_u:system_r:mysqld_iouring_t:s0 tclass=anon_inode Add a new LSM audit data type holding the inode and the class name. Signed-off-by: Christian Göttsche [PM: adjusted 'anonclass' to be a trusted string, cgzones approved] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 17d02eda9538..97a8b21eb033 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -76,6 +76,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_IBENDPORT 14 #define LSM_AUDIT_DATA_LOCKDOWN 15 #define LSM_AUDIT_DATA_NOTIFICATION 16 +#define LSM_AUDIT_DATA_ANONINODE 17 union { struct path path; struct dentry *dentry; @@ -96,6 +97,7 @@ struct common_audit_data { struct lsm_ibpkey_audit *ibpkey; struct lsm_ibendport_audit *ibendport; int reason; + const char *anonclass; } u; /* this union contains LSM specific data */ union { -- cgit v1.2.3 From c2aa2dfef243efe213a480a1ee8566507a5152f4 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 3 May 2022 01:09:56 -0700 Subject: seccomp: Add wait_killable semantic to seccomp user notifier This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) that makes it so that when notifications are received by the supervisor the notifying process will transition to wait killable semantics. Although wait killable isn't a set of semantics formally exposed to userspace, the concept is searchable. If the notifying process is signaled prior to the notification being received by the userspace agent, it will be handled as normal. One quirk about how this is handled is that the notifying process only switches to TASK_KILLABLE if it receives a wakeup from either an addfd or a signal. This is to avoid an unnecessary wakeup of the notifying task. The reasons behind switching into wait_killable only after userspace receives the notification are: * Avoiding unncessary work - Often, workloads will perform work that they may abort (request racing comes to mind). This allows for syscalls to be aborted safely prior to the notification being received by the supervisor. In this, the supervisor doesn't end up doing work that the workload does not want to complete anyways. * Avoiding side effects - We don't want the syscall to be interruptible once the supervisor starts doing work because it may not be trivial to reverse the operation. For example, unmounting a file system may take a long time, and it's hard to rollback, or treat that as reentrant. * Avoid breaking runtimes - Various runtimes do not GC when they are during a syscall (or while running native code that subsequently calls a syscall). If many notifications are blocked, and not picked up by the supervisor, this can get the application into a bad state. Signed-off-by: Sargun Dhillon Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me --- include/linux/seccomp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 0c564e5d40ff..d31d76be4982 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -8,7 +8,8 @@ SECCOMP_FILTER_FLAG_LOG | \ SECCOMP_FILTER_FLAG_SPEC_ALLOW | \ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ - SECCOMP_FILTER_FLAG_TSYNC_ESRCH) + SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \ + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) /* sizeof() the first published struct seccomp_notif_addfd */ #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 -- cgit v1.2.3 From 58caed3dacb4354a25a1aa8d2febc3e9648ba1f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 May 2022 16:27:03 -0700 Subject: netdev: reshuffle netif_napi_add() APIs to allow dropping weight Most drivers should not have to worry about selecting the right weight for their NAPI instances and pass NAPI_POLL_WEIGHT. It'd be best if we didn't require the argument at all and selected the default internally. This change prepares the ground for such reshuffling, allowing for a smooth transition. The following API should remain after the next release cycle: netif_napi_add() netif_napi_add_weight() netif_napi_add_tx() netif_napi_add_tx_weight() Where the _weight() variants take an explicit weight argument. I opted for a _weight() suffix rather than a __ prefix, because we use __ in places to mean that caller needs to also issue a synchronize_net() call. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220502232703.396351-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 50 +++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4aba92a4042a..eaf66e57d891 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2499,37 +2499,53 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define NAPI_POLL_WEIGHT 64 +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); + /** - * netif_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add() - initialize a NAPI context + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +static inline void +netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netif_napi_add_weight(dev, napi, poll, weight); +} + +static inline void +netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add_weight(dev, napi, poll, weight); +} + +#define netif_tx_napi_add netif_napi_add_tx_weight /** - * netif_tx_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only + * @dev: network device + * @napi: NAPI context + * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ -static inline void netif_tx_napi_add(struct net_device *dev, +static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) + int (*poll)(struct napi_struct *, int)) { - set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); - netif_napi_add(dev, napi, poll, weight); + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } /** -- cgit v1.2.3 From c674df973ad8af2074c834788e167332d81309fa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Mar 2022 20:36:15 +0200 Subject: net/mlx5: Store IPsec ESN update work in XFRM state mlx5 IPsec code updated ESN through workqueue with allocation calls in the data path, which can be saved easily if the work is created during XFRM state initialization routine. The locking used later in the work didn't protect from anything because change of HW context is possible during XFRM state add or delete only, which can cancel work and make sure that it is not running. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 0f2596297f6a..a2720ebbb9fd 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -127,8 +127,8 @@ struct mlx5_accel_esp_xfrm * mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, const struct mlx5_accel_esp_xfrm_attrs *attrs); void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); -int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs); +void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); #else @@ -145,9 +145,11 @@ mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, } static inline void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} -static inline int +static inline void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ +} #endif /* CONFIG_MLX5_EN_IPSEC */ #endif /* __MLX5_ACCEL_H__ */ -- cgit v1.2.3 From 2ea36e2e4ad2b77bd5d45142a529b72eb5ca9156 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Mar 2022 20:55:00 +0200 Subject: net/mlx5: Remove useless validity check All callers build xfrm attributes with help of mlx5e_ipsec_build_accel_xfrm_attrs() function that ensure validity of attributes. There is no need to recheck them again. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index a2720ebbb9fd..9c511d466e55 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -36,10 +36,6 @@ #include -enum mlx5_accel_esp_aes_gcm_keymat_iv_algo { - MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ, -}; - enum mlx5_accel_esp_flags { MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, @@ -57,14 +53,9 @@ enum mlx5_accel_esp_keymats { MLX5_ACCEL_ESP_KEYMAT_AES_GCM, }; -enum mlx5_accel_esp_replay { - MLX5_ACCEL_ESP_REPLAY_NONE, - MLX5_ACCEL_ESP_REPLAY_BMP, -}; struct aes_gcm_keymat { u64 seq_iv; - enum mlx5_accel_esp_aes_gcm_keymat_iv_algo iv_algo; u32 salt; u32 icv_len; @@ -81,7 +72,6 @@ struct mlx5_accel_esp_xfrm_attrs { u32 tfc_pad; u32 flags; u32 sa_handle; - enum mlx5_accel_esp_replay replay_type; union { struct { u32 size; -- cgit v1.2.3 From c6e3b421c7079af67201351c9faff62613e06f40 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 9 Mar 2022 10:35:05 +0200 Subject: net/mlx5: Merge various control path IPsec headers into one file The mlx5 IPsec code has logical separation between code that operates with XFRM objects (ipsec.c), HW objects (ipsec_offload.c), flow steering logic (ipsec_fs.c) and data path (ipsec_rxtx.c). Such separation makes sense for C-files, but isn't needed at all for H-files as they are included in batch anyway. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 145 --------------------------------------------- 1 file changed, 145 deletions(-) delete mode 100644 include/linux/mlx5/accel.h (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h deleted file mode 100644 index 9c511d466e55..000000000000 --- a/include/linux/mlx5/accel.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2018 Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#ifndef __MLX5_ACCEL_H__ -#define __MLX5_ACCEL_H__ - -#include - -enum mlx5_accel_esp_flags { - MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ - MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, - MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED = 1UL << 1, - MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP = 1UL << 2, -}; - -enum mlx5_accel_esp_action { - MLX5_ACCEL_ESP_ACTION_DECRYPT, - MLX5_ACCEL_ESP_ACTION_ENCRYPT, -}; - -enum mlx5_accel_esp_keymats { - MLX5_ACCEL_ESP_KEYMAT_AES_NONE, - MLX5_ACCEL_ESP_KEYMAT_AES_GCM, -}; - - -struct aes_gcm_keymat { - u64 seq_iv; - - u32 salt; - u32 icv_len; - - u32 key_len; - u32 aes_key[256 / 32]; -}; - -struct mlx5_accel_esp_xfrm_attrs { - enum mlx5_accel_esp_action action; - u32 esn; - __be32 spi; - u32 seq; - u32 tfc_pad; - u32 flags; - u32 sa_handle; - union { - struct { - u32 size; - - } bmp; - } replay; - enum mlx5_accel_esp_keymats keymat_type; - union { - struct aes_gcm_keymat aes_gcm; - } keymat; - - union { - __be32 a4; - __be32 a6[4]; - } saddr; - - union { - __be32 a4; - __be32 a6[4]; - } daddr; - - u8 is_ipv6; -}; - -struct mlx5_accel_esp_xfrm { - struct mlx5_core_dev *mdev; - struct mlx5_accel_esp_xfrm_attrs attrs; -}; - -enum mlx5_accel_ipsec_cap { - MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, - MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 1, - MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 2, - MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 3, - MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 4, -}; - -#ifdef CONFIG_MLX5_EN_IPSEC - -u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev); - -struct mlx5_accel_esp_xfrm * -mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs); -void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); -void mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs); - -#else - -static inline u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) -{ - return 0; -} - -static inline struct mlx5_accel_esp_xfrm * -mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, - const struct mlx5_accel_esp_xfrm_attrs *attrs) -{ - return ERR_PTR(-EOPNOTSUPP); -} -static inline void -mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} -static inline void -mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, - const struct mlx5_accel_esp_xfrm_attrs *attrs) -{ -} - -#endif /* CONFIG_MLX5_EN_IPSEC */ -#endif /* __MLX5_ACCEL_H__ */ -- cgit v1.2.3 From 1c4a59b9fa98c222bd6a8fa82bff942312165c1a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 27 Mar 2022 19:23:19 +0300 Subject: net/mlx5: Remove not-supported ICV length mlx5 doesn't allow to configure any AEAD ICV length other than 128, so remove the logic that configures other unsupported values. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7f4ec9faa180..7bab3e51c61e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11379,8 +11379,6 @@ enum { enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, - MLX5_IPSEC_OBJECT_ICV_LEN_12B, - MLX5_IPSEC_OBJECT_ICV_LEN_8B, }; struct mlx5_ifc_ipsec_obj_bits { -- cgit v1.2.3 From 31ab09b4218879bc394c9faa6da983a82a694600 Mon Sep 17 00:00:00 2001 From: Dipen Patel Date: Fri, 22 Apr 2022 13:52:13 -0700 Subject: drivers: Add hardware timestamp engine (HTE) subsystem Some devices can timestamp system lines/signals/Buses in real-time using the hardware counter or other hardware means which can give finer granularity and help avoid jitter introduced by software timestamping. To utilize such functionality, this patchset creates HTE subsystem where devices can register themselves as providers so that the consumers devices can request specific line from the providers. The patch also adds compilation support in Makefile and menu options in Kconfig. The provider does following: - Registers chip with the framework. - Provides translation hook to convert logical line id. - Provides enable/disable, request/release callbacks. - Pushes timestamp data to HTE subsystem. The consumer does following: - Initializes line attribute. - Gets HTE timestamp descriptor. - Requests timestamp functionality. - Puts HTE timestamp descriptor. Signed-off-by: Dipen Patel Reported-by: kernel test robot Signed-off-by: Thierry Reding --- include/linux/hte.h | 271 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 include/linux/hte.h (limited to 'include/linux') diff --git a/include/linux/hte.h b/include/linux/hte.h new file mode 100644 index 000000000000..8289055061ab --- /dev/null +++ b/include/linux/hte.h @@ -0,0 +1,271 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_HTE_H +#define __LINUX_HTE_H + +#include + +struct hte_chip; +struct hte_device; +struct of_phandle_args; + +/** + * enum hte_edge - HTE line edge flags. + * + * @HTE_EDGE_NO_SETUP: No edge setup. In this case consumer will setup edges, + * for example during request irq call. + * @HTE_RISING_EDGE_TS: Rising edge. + * @HTE_FALLING_EDGE_TS: Falling edge. + * + */ +enum hte_edge { + HTE_EDGE_NO_SETUP = 1U << 0, + HTE_RISING_EDGE_TS = 1U << 1, + HTE_FALLING_EDGE_TS = 1U << 2, +}; + +/** + * enum hte_return - HTE subsystem return values used during callback. + * + * @HTE_CB_HANDLED: The consumer handled the data. + * @HTE_RUN_SECOND_CB: The consumer needs further processing, in that case + * HTE subsystem calls secondary callback provided by the consumer where it + * is allowed to sleep. + */ +enum hte_return { + HTE_CB_HANDLED, + HTE_RUN_SECOND_CB, +}; + +/** + * struct hte_ts_data - HTE timestamp data. + * + * @tsc: Timestamp value. + * @seq: Sequence counter of the timestamps. + * @raw_level: Level of the line at the timestamp if provider supports it, + * -1 otherwise. + */ +struct hte_ts_data { + u64 tsc; + u64 seq; + int raw_level; +}; + +/** + * struct hte_clk_info - Clock source info that HTE provider uses to timestamp. + * + * @hz: Supported clock rate in HZ, for example 1KHz clock = 1000. + * @type: Supported clock type. + */ +struct hte_clk_info { + u64 hz; + clockid_t type; +}; + +/** + * typedef hte_ts_cb_t - HTE timestamp data processing primary callback. + * + * The callback is used to push timestamp data to the client and it is + * not allowed to sleep. + * + * @ts: HW timestamp data. + * @data: Client supplied data. + */ +typedef enum hte_return (*hte_ts_cb_t)(struct hte_ts_data *ts, void *data); + +/** + * typedef hte_ts_sec_cb_t - HTE timestamp data processing secondary callback. + * + * This is used when the client needs further processing where it is + * allowed to sleep. + * + * @data: Client supplied data. + * + */ +typedef enum hte_return (*hte_ts_sec_cb_t)(void *data); + +/** + * struct hte_line_attr - Line attributes. + * + * @line_id: The logical ID understood by the consumers and providers. + * @line_data: Line data related to line_id. + * @edge_flags: Edge setup flags. + * @name: Descriptive name of the entity that is being monitored for the + * hardware timestamping. If null, HTE core will construct the name. + * + */ +struct hte_line_attr { + u32 line_id; + void *line_data; + unsigned long edge_flags; + const char *name; +}; + +/** + * struct hte_ts_desc - HTE timestamp descriptor. + * + * This structure is a communication token between consumers to subsystem + * and subsystem to providers. + * + * @attr: The line attributes. + * @hte_data: Subsystem's private data, set by HTE subsystem. + */ +struct hte_ts_desc { + struct hte_line_attr attr; + void *hte_data; +}; + +/** + * struct hte_ops - HTE operations set by providers. + * + * @request: Hook for requesting a HTE timestamp. Returns 0 on success, + * non-zero for failures. + * @release: Hook for releasing a HTE timestamp. Returns 0 on success, + * non-zero for failures. + * @enable: Hook to enable the specified timestamp. Returns 0 on success, + * non-zero for failures. + * @disable: Hook to disable specified timestamp. Returns 0 on success, + * non-zero for failures. + * @get_clk_src_info: Hook to get the clock information the provider uses + * to timestamp. Returns 0 for success and negative error code for failure. On + * success HTE subsystem fills up provided struct hte_clk_info. + * + * xlated_id parameter is used to communicate between HTE subsystem and the + * providers and is translated by the provider. + */ +struct hte_ops { + int (*request)(struct hte_chip *chip, struct hte_ts_desc *desc, + u32 xlated_id); + int (*release)(struct hte_chip *chip, struct hte_ts_desc *desc, + u32 xlated_id); + int (*enable)(struct hte_chip *chip, u32 xlated_id); + int (*disable)(struct hte_chip *chip, u32 xlated_id); + int (*get_clk_src_info)(struct hte_chip *chip, + struct hte_clk_info *ci); +}; + +/** + * struct hte_chip - Abstract HTE chip. + * + * @name: functional name of the HTE IP block. + * @dev: device providing the HTE. + * @ops: callbacks for this HTE. + * @nlines: number of lines/signals supported by this chip. + * @xlate_of: Callback which translates consumer supplied logical ids to + * physical ids, return 0 for the success and negative for the failures. + * It stores (between 0 to @nlines) in xlated_id parameter for the success. + * @xlate_plat: Same as above but for the consumers with no DT node. + * @match_from_linedata: Match HTE device using the line_data. + * @of_hte_n_cells: Number of cells used to form the HTE specifier. + * @gdev: HTE subsystem abstract device, internal to the HTE subsystem. + * @data: chip specific private data. + */ +struct hte_chip { + const char *name; + struct device *dev; + const struct hte_ops *ops; + u32 nlines; + int (*xlate_of)(struct hte_chip *gc, + const struct of_phandle_args *args, + struct hte_ts_desc *desc, u32 *xlated_id); + int (*xlate_plat)(struct hte_chip *gc, struct hte_ts_desc *desc, + u32 *xlated_id); + bool (*match_from_linedata)(const struct hte_chip *chip, + const struct hte_ts_desc *hdesc); + u8 of_hte_n_cells; + + struct hte_device *gdev; + void *data; +}; + +#if IS_ENABLED(CONFIG_HTE) +/* HTE APIs for the providers */ +int devm_hte_register_chip(struct hte_chip *chip); +int hte_push_ts_ns(const struct hte_chip *chip, u32 xlated_id, + struct hte_ts_data *data); + +/* HTE APIs for the consumers */ +int hte_init_line_attr(struct hte_ts_desc *desc, u32 line_id, + unsigned long edge_flags, const char *name, + void *data); +int hte_ts_get(struct device *dev, struct hte_ts_desc *desc, int index); +int hte_ts_put(struct hte_ts_desc *desc); +int hte_request_ts_ns(struct hte_ts_desc *desc, hte_ts_cb_t cb, + hte_ts_sec_cb_t tcb, void *data); +int devm_hte_request_ts_ns(struct device *dev, struct hte_ts_desc *desc, + hte_ts_cb_t cb, hte_ts_sec_cb_t tcb, void *data); +int of_hte_req_count(struct device *dev); +int hte_enable_ts(struct hte_ts_desc *desc); +int hte_disable_ts(struct hte_ts_desc *desc); +int hte_get_clk_src_info(const struct hte_ts_desc *desc, + struct hte_clk_info *ci); + +#else /* !CONFIG_HTE */ +static inline int devm_hte_register_chip(struct hte_chip *chip) +{ + return -EOPNOTSUPP; +} + +static inline int hte_push_ts_ns(const struct hte_chip *chip, + u32 xlated_id, + const struct hte_ts_data *data) +{ + return -EOPNOTSUPP; +} + +static inline int hte_init_line_attr(struct hte_ts_desc *desc, u32 line_id, + unsigned long edge_flags, + const char *name, void *data) +{ + return -EOPNOTSUPP; +} + +static inline int hte_ts_get(struct device *dev, struct hte_ts_desc *desc, + int index) +{ + return -EOPNOTSUPP; +} + +static inline int hte_ts_put(struct hte_ts_desc *desc) +{ + return -EOPNOTSUPP; +} + +static inline int hte_request_ts_ns(struct hte_ts_desc *desc, hte_ts_cb_t cb, + hte_ts_sec_cb_t tcb, void *data) +{ + return -EOPNOTSUPP; +} + +static inline int devm_hte_request_ts_ns(struct device *dev, + struct hte_ts_desc *desc, + hte_ts_cb_t cb, + hte_ts_sec_cb_t tcb, + void *data) +{ + return -EOPNOTSUPP; +} + +static inline int of_hte_req_count(struct device *dev) +{ + return -EOPNOTSUPP; +} + +static inline int hte_enable_ts(struct hte_ts_desc *desc) +{ + return -EOPNOTSUPP; +} + +static inline int hte_disable_ts(struct hte_ts_desc *desc) +{ + return -EOPNOTSUPP; +} + +static inline int hte_get_clk_src_info(const struct hte_ts_desc *desc, + struct hte_clk_info *ci) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_HTE */ + +#endif -- cgit v1.2.3 From 42112dd77b74220e6a1f4a71bb51ca3f583d3842 Mon Sep 17 00:00:00 2001 From: Dipen Patel Date: Fri, 22 Apr 2022 13:52:16 -0700 Subject: gpiolib: Add HTE support Some GPIO chip can provide hardware timestamp support on its GPIO lines , in order to support that, additional API needs to be added which can talk to both GPIO chip and HTE (hardware timestamping engine) providers if there is any dependencies. This patch introduces optional hooks to enable and disable hardware timestamping related features in the GPIO controller chip. Signed-off-by: Dipen Patel Reviewed-by: Linus Walleij Reported-by: kernel test robot Signed-off-by: Thierry Reding --- include/linux/gpio/consumer.h | 16 ++++++++++++++-- include/linux/gpio/driver.h | 10 ++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index c3aa8b330e1c..7eaec081ae6c 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -109,6 +109,8 @@ int gpiod_get_direction(struct gpio_desc *desc); int gpiod_direction_input(struct gpio_desc *desc); int gpiod_direction_output(struct gpio_desc *desc, int value); int gpiod_direction_output_raw(struct gpio_desc *desc, int value); +int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); +int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); /* Value get/set from non-sleeping context */ int gpiod_get_value(const struct gpio_desc *desc); @@ -350,8 +352,18 @@ static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) WARN_ON(desc); return -ENOSYS; } - - +static inline int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, + unsigned long flags) +{ + WARN_ON(desc); + return -ENOSYS; +} +static inline int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, + unsigned long flags) +{ + WARN_ON(desc); + return -ENOSYS; +} static inline int gpiod_get_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 98c93510640e..0ce96ebdaa36 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -323,6 +323,10 @@ struct gpio_irq_chip { * @add_pin_ranges: optional routine to initialize pin ranges, to be used when * requires special mapping of the pins that provides GPIO functionality. * It is called after adding GPIO chip and before adding IRQ chip. + * @en_hw_timestamp: Dependent on GPIO chip, an optional routine to + * enable hardware timestamp. + * @dis_hw_timestamp: Dependent on GPIO chip, an optional routine to + * disable hardware timestamp. * @base: identifies the first GPIO number handled by this chip; * or, if negative during registration, requests dynamic ID allocation. * DEPRECATION: providing anything non-negative and nailing the base @@ -419,6 +423,12 @@ struct gpio_chip { int (*add_pin_ranges)(struct gpio_chip *gc); + int (*en_hw_timestamp)(struct gpio_chip *gc, + u32 offset, + unsigned long flags); + int (*dis_hw_timestamp)(struct gpio_chip *gc, + u32 offset, + unsigned long flags); int base; u16 ngpio; u16 offset; -- cgit v1.2.3 From 3ae2722c93c916b47bfe47a4326e88cc4abb9bf4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 27 Apr 2022 14:55:57 +0200 Subject: mmc: mmci: Remove custom ios handler The custom boardfile ios handler isn't used anywhere in the kernel. Delete it. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220427125557.1608825-1-linus.walleij@linaro.org Signed-off-by: Ulf Hansson --- include/linux/amba/mmci.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/mmci.h b/include/linux/amba/mmci.h index c92ebc39fc1f..6f96dc2209c0 100644 --- a/include/linux/amba/mmci.h +++ b/include/linux/amba/mmci.h @@ -13,17 +13,11 @@ * @ocr_mask: available voltages on the 4 pins from the block, this * is ignored if a regulator is used, see the MMC_VDD_* masks in * mmc/host.h - * @ios_handler: a callback function to act on specfic ios changes, - * used for example to control a levelshifter - * mask into a value to be binary (or set some other custom bits - * in MMCIPWR) or:ed and written into the MMCIPWR register of the - * block. May also control external power based on the power_mode. * @status: if no GPIO line was given to the block in this function will * be called to determine whether a card is present in the MMC slot or not */ struct mmci_platform_data { unsigned int ocr_mask; - int (*ios_handler)(struct device *, struct mmc_ios *); unsigned int (*status)(struct device *); }; -- cgit v1.2.3 From 00ce3873f730fb24c657854ec04374358e711838 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 4 May 2022 10:17:32 +0200 Subject: opp: Add apis to retrieve opps with interconnect bandwidth Add dev_pm_opp_find_bw_ceil and dev_pm_opp_find_bw_floor to retrieve opps based on interconnect associated with the opp and bandwidth. The index variable is the index of the interconnect as specified in the opp table in Devicetree. Co-developed-by: Thara Gopinath Signed-off-by: Thara Gopinath Signed-off-by: Krzysztof Kozlowski Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 0d85a63a1f78..dcea178868c9 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -129,6 +129,13 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev, struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq); + +struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, + unsigned int *bw, int index); + +struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, + unsigned int *bw, int index); + void dev_pm_opp_put(struct dev_pm_opp *opp); int dev_pm_opp_add(struct device *dev, unsigned long freq, @@ -279,6 +286,18 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, + unsigned int *bw, int index) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, + unsigned int *bw, int index) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {} static inline int dev_pm_opp_add(struct device *dev, unsigned long freq, -- cgit v1.2.3 From 22079af7df5a5dfef1c4d160abfd43035211759e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 4 May 2022 15:40:22 +0530 Subject: opp: Reorder definition of ceil/floor helpers Reorder the helpers to keep all freq specific ones, followed by level and bw. No functional change. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index dcea178868c9..6708b4ec244d 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -117,16 +117,16 @@ unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev); struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available); -struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, - unsigned int level); -struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, - unsigned int *level); - struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq); struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev, unsigned long u_volt); +struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, + unsigned int level); +struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, + unsigned int *level); + struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq); @@ -250,12 +250,6 @@ static inline unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev) return 0; } -static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, - unsigned long freq, bool available) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, unsigned int level) { @@ -268,6 +262,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, + unsigned long freq, bool available) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq) { -- cgit v1.2.3 From 34453c2e9f799d02f5f379519495208bbd96a935 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Apr 2022 19:23:24 +0100 Subject: irqchip/gic-v3: Exposes bit values for GICR_CTLR.{IR, CES} As we're about to expose GICR_CTLR.{IR,CES} to guests, populate the include file with the architectural values. Signed-off-by: Marc Zyngier Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220405182327.205520-2-maz@kernel.org --- include/linux/irqchip/arm-gic-v3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 12d91f0dedf9..728691365464 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -127,6 +127,8 @@ #define GICR_PIDR2 GICD_PIDR2 #define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) #define GICR_CTLR_RWP (1UL << 3) #define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) -- cgit v1.2.3 From ec69dfbdc426f22a9557e5c5408d7902fe0e0144 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 2 May 2022 14:54:06 -0700 Subject: soc: qcom: llcc: Add sc8180x and sc8280xp configurations Add LLCC configuration data for the SC8180X and SC8280XP platforms, based on the downstream tables. Signed-off-by: Bjorn Andersson Reviewed-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/20220502215406.612967-3-bjorn.andersson@linaro.org --- include/linux/soc/qcom/llcc-qcom.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 0bc21ee58fac..9ed5384c5ca1 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -29,6 +29,8 @@ #define LLCC_AUDHW 22 #define LLCC_NPU 23 #define LLCC_WLHW 24 +#define LLCC_PIMEM 25 +#define LLCC_DRE 26 #define LLCC_CVP 28 #define LLCC_MODPE 29 #define LLCC_APTCM 30 -- cgit v1.2.3 From 170f37d6aa6ad4582eefd7459015de79e244536e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 3 May 2022 00:09:31 -0400 Subject: block: Do not call folio_next() on an unreferenced folio It is unsafe to call folio_next() on a folio unless you hold a reference on it that prevents it from being split or freed. After returning from the iterator, iomap calls folio_end_writeback() which may drop the last reference to the page, or allow the page to be split. If that happens, the iterator will not advance far enough through the bio_vec, leading to assertion failures like the BUG() in folio_end_writeback() that checks we're not trying to end writeback on a page not currently under writeback. Other assertion failures were also seen, but they're all explained by this one bug. Fix the bug by remembering where the next folio starts before returning from the iterator. There are other ways of fixing this bug, but this seems the simplest. Reported-by: Darrick J. Wong Tested-by: Darrick J. Wong Reported-by: Brian Foster Tested-by: Brian Foster Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/bio.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 278cc81cc1e7..00450fd86bb4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -269,6 +269,7 @@ struct folio_iter { size_t offset; size_t length; /* private: for use by the iterator */ + struct folio *_next; size_t _seg_count; int _i; }; @@ -283,6 +284,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, PAGE_SIZE * (bvec->bv_page - &fi->folio->page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); + fi->_next = folio_next(fi->folio); fi->_i = i; } @@ -290,9 +292,10 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) { fi->_seg_count -= fi->length; if (fi->_seg_count) { - fi->folio = folio_next(fi->folio); + fi->folio = fi->_next; fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); + fi->_next = folio_next(fi->folio); } else if (fi->_i + 1 < bio->bi_vcnt) { bio_first_folio(fi, bio, fi->_i + 1); } else { -- cgit v1.2.3 From 8e1de7042596abb7cb277ea751fc13a4c2b65aea Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sun, 13 Feb 2022 16:44:45 +0200 Subject: thunderbolt: Add support for XDomain lane bonding The USB4 Inter-Domain Service specification defines a protocol that can be used to establish lane bonding between two USB4 domains (hosts). So far we have not implemented it because the host controller DMA was not fast enough to be able to go over 20 Gbits/s even if lanes were bonded. However, starting from Intel Alder Lake CPUs the DMA can go over 20 Gbits/s so now it makes more sense to add this support to the driver. Because both ends need to negotiate the bonding we add a simple state machine that tracks the connection state and does the necessary steps described by the USB4 Inter-Domain Service specification. We only establish lane bonding when both sides of the link support it. Otherwise we default to use the single lane. Also this is only done when software connection manager is used. On systems with firmware based connection manager, it handles the high-speed tunneling so bonding lanes is specific to the implementation (Intel firmware based connection manager does not support lane bonding). Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 124e13cb1469..e13fe15e6a51 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -198,15 +198,15 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @local_property_block_len: Length of the @local_property_block in dwords * @remote_properties: Properties exported by the remote domain * @remote_property_block_gen: Generation of @remote_properties - * @get_uuid_work: Work used to retrieve @remote_uuid - * @uuid_retries: Number of times left @remote_uuid is requested before - * giving up - * @get_properties_work: Work used to get remote domain properties - * @properties_retries: Number of times left to read properties + * @state: Next XDomain discovery state to run + * @state_work: Work used to run the next state + * @state_retries: Number of retries remain for the state * @properties_changed_work: Work used to notify the remote domain that * our properties have changed * @properties_changed_retries: Number of times left to send properties * changed notification + * @bonding_possible: True if lane bonding is possible on local side + * @target_link_width: Target link width from the remote host * @link: Root switch link the remote domain is connected (ICM only) * @depth: Depth in the chain the remote domain is connected (ICM only) * @@ -244,12 +244,13 @@ struct tb_xdomain { u32 local_property_block_len; struct tb_property_dir *remote_properties; u32 remote_property_block_gen; - struct delayed_work get_uuid_work; - int uuid_retries; - struct delayed_work get_properties_work; - int properties_retries; + int state; + struct delayed_work state_work; + int state_retries; struct delayed_work properties_changed_work; int properties_changed_retries; + bool bonding_possible; + u8 target_link_width; u8 link; u8 depth; }; -- cgit v1.2.3 From 4b439e25e29ec336c0e71ef1d1b212c412526518 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:27 +0200 Subject: mm, hugetlbfs: Allow an arch to always use generic versions of get_unmapped_area functions Unlike most architectures, powerpc can only define at runtime if it is going to use the generic arch_get_unmapped_area() or not. Today, powerpc has a copy of the generic arch_get_unmapped_area() because when selection HAVE_ARCH_UNMAPPED_AREA the generic arch_get_unmapped_area() is not available. Rename it generic_get_unmapped_area() and make it independent of HAVE_ARCH_UNMAPPED_AREA. Do the same for arch_get_unmapped_area_topdown() versus HAVE_ARCH_UNMAPPED_AREA_TOPDOWN. Do the same for hugetlb_get_unmapped_area() versus HAVE_ARCH_HUGETLB_UNMAPPED_AREA. Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Acked-by: Andrew Morton Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/77f9d3e592f1c8511df9381aa1c4e754651da4d1.1649523076.git.christophe.leroy@csgroup.eu --- include/linux/hugetlb.h | 5 +++++ include/linux/sched/mm.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac2a1d758a80..bc97c8458df6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -519,6 +519,11 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +unsigned long +generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); + /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1ad1f4bfa025..da3a32d05b34 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -153,6 +153,15 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); + +unsigned long +generic_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +unsigned long +generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} -- cgit v1.2.3 From 2cb4de085f383cb9289083d1bedbaad046f640eb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Apr 2022 19:17:28 +0200 Subject: mm: Add len and flags parameters to arch_get_mmap_end() Powerpc needs flags and len to make decision on arch_get_mmap_end(). So add them as parameters to arch_get_mmap_end(). Signed-off-by: Christophe Leroy Acked-by: Catalin Marinas Acked-by: Andrew Morton Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b556daabe7d2bdb2361c4d6130280da7c1ba2c14.1649523076.git.christophe.leroy@csgroup.eu --- include/linux/sched/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index da3a32d05b34..8cd975a8bfeb 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -137,7 +137,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #ifdef CONFIG_MMU #ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) +#define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) #endif #ifndef arch_get_mmap_base -- cgit v1.2.3 From d77e745613680c54708470402e2b623dcd769681 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 30 Apr 2022 04:51:44 +0200 Subject: regmap: Add bulk read/write callbacks into regmap_config Currently the regmap_config structure only allows the user to implement single element register read/write using .reg_read/.reg_write callbacks. The regmap_bus already implements bulk counterparts of both, and is being misused as a workaround for the missing bulk read/write callbacks in regmap_config by a couple of drivers. To stop this misuse, add the bulk read/write callbacks to regmap_config and call them from the regmap core code. Signed-off-by: Marek Vasut Cc: Jagan Teki Cc: Mark Brown Cc: Maxime Ripard Cc: Robert Foss Cc: Sam Ravnborg Cc: Thomas Zimmermann To: dri-devel@lists.freedesktop.org Link: https://lore.kernel.org/r/20220430025145.640305-1-marex@denx.de Signed-off-by: Mark Brown --- include/linux/regmap.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index de81a94d7b30..8952fa3d0d59 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -299,6 +299,12 @@ typedef void (*regmap_unlock)(void *); * if the function require special handling with lock and reg * handling and the operation cannot be represented as a simple * update_bits operation on a bus such as SPI, I2C, etc. + * @read: Optional callback that if filled will be used to perform all the + * bulk reads from the registers. Data is returned in the buffer used + * to transmit data. + * @write: Same as above for writing. + * @max_raw_read: Max raw read size that can be used on the device. + * @max_raw_write: Max raw write size that can be used on the device. * @fast_io: Register IO is fast. Use a spinlock instead of a mutex * to perform locking. This field is ignored if custom lock/unlock * functions are used (see fields lock/unlock of struct regmap_config). @@ -385,6 +391,12 @@ struct regmap_config { int (*reg_write)(void *context, unsigned int reg, unsigned int val); int (*reg_update_bits)(void *context, unsigned int reg, unsigned int mask, unsigned int val); + /* Bulk read/write */ + int (*read)(void *context, const void *reg_buf, size_t reg_size, + void *val_buf, size_t val_size); + int (*write)(void *context, const void *data, size_t count); + size_t max_raw_read; + size_t max_raw_write; bool fast_io; -- cgit v1.2.3 From 6d5f2207447b28dc73c25b3907e7ee32ee66bdbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 2 May 2022 19:08:27 +0200 Subject: gpio: max732x: Drop unused support for irq and setup code via platform data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only user of max732x_platform_data is arch/arm/mach-pxa/littleton.c and it only uses .gpio_base. So drop the other members from the data struct and simplify the driver accordingly. The motivating side effect of this change is that the .remove() callback cannot return a nonzero error code any more which prepares making i2c remove callbacks return void. Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- include/linux/platform_data/max732x.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/max732x.h b/include/linux/platform_data/max732x.h index f231c635faec..423999207cd5 100644 --- a/include/linux/platform_data/max732x.h +++ b/include/linux/platform_data/max732x.h @@ -7,17 +7,5 @@ struct max732x_platform_data { /* number of the first GPIO */ unsigned gpio_base; - - /* interrupt base */ - int irq_base; - - void *context; /* param to setup/teardown */ - - int (*setup)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); - int (*teardown)(struct i2c_client *client, - unsigned gpio, unsigned ngpio, - void *context); }; #endif /* __LINUX_I2C_MAX732X_H */ -- cgit v1.2.3 From 8221ecd4e4620cf2f0e942cafcdecac1685f8f16 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Apr 2022 15:04:27 +0200 Subject: PCI/PM: Drop the runtime_d3cold device flag The runtime_d3cold flag is not needed any more, so drop it. Link: https://lore.kernel.org/r/8077784.T7Z3S40VBb@kreacher Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas Reviewed-by: Mika Westerberg --- include/linux/pci.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60adf42460ab..3266ac08f8ec 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -379,10 +379,6 @@ struct pci_dev { unsigned int mmio_always_on:1; /* Disallow turning off io/mem decoding during BAR sizing */ unsigned int wakeup_prepared:1; - unsigned int runtime_d3cold:1; /* Whether go through runtime - D3cold, not set for devices - powered on/off by the - corresponding bridge */ unsigned int skip_bus_pm:1; /* Internal: Skip bus-level PM */ unsigned int ignore_hotplug:1; /* Ignore hotplug events */ unsigned int hotplug_user_indicators:1; /* SlotCtl indicators -- cgit v1.2.3 From 26817fb7b066b21c66cd41d75ed2137e046045be Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 4 May 2022 09:27:10 -0400 Subject: Revert "fbdev: fbmem: add a helper to determine if an aperture is used by a fw fb" This reverts commit 9a45ac2320d0a6ae01880a30d4b86025fce4061b. This was added a helper for amdgpu to workaround a runtime pm regression caused by a runtime pm fix in efifb. We now have a better workaround in amdgpu in commit f95af4a9236695 ("drm/amdgpu: don't runtime suspend if there are displays attached (v3)") so this workaround is no longer necessary. Since amdgpu was the only user of this interface, we can remove it. Reviewed-by: Javier Martinez Canillas Acked-by: Daniel Vetter Signed-off-by: Alex Deucher --- include/linux/fb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 9a77ab615c36..147d582dab41 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -612,7 +612,6 @@ extern int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, const char *name); extern int remove_conflicting_framebuffers(struct apertures_struct *a, const char *name, bool primary); -extern bool is_firmware_framebuffer(struct apertures_struct *a); extern int fb_prepare_logo(struct fb_info *fb_info, int rotate); extern int fb_show_logo(struct fb_info *fb_info, int rotate); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -- cgit v1.2.3 From c67b627e99affe4865b13557d68152ad4c35515c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 May 2022 10:09:47 -0700 Subject: net: Make msg_zerocopy_alloc static msg_zerocopy_alloc is only used by msg_zerocopy_realloc; remove the export and make static in skbuff.c Signed-off-by: David Ahern Acked-by: Jonathan Lemon Link: https://lore.kernel.org/r/20220504170947.18773-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3270cb72e4d8..5c2599e3fe7d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1665,7 +1665,6 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From 85db6352fc8a158a893151baa1716463d34a20d0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 4 May 2022 11:09:14 +0300 Subject: net: Fix features skip in for_each_netdev_feature() The find_next_netdev_feature() macro gets the "remaining length", not bit index. Passing "bit - 1" for the following iteration is wrong as it skips the adjacent bit. Pass "bit" instead. Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian") Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Link: https://lore.kernel.org/r/20220504080914.1918-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c6b9e416225..7c2d77d75a88 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -169,7 +169,7 @@ enum { #define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) #define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) -/* Finds the next feature with the highest number of the range of start till 0. +/* Finds the next feature with the highest number of the range of start-1 till 0. */ static inline int find_next_netdev_feature(u64 feature, unsigned long start) { @@ -188,7 +188,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) for ((bit) = find_next_netdev_feature((mask_addr), \ NETDEV_FEATURE_COUNT); \ (bit) >= 0; \ - (bit) = find_next_netdev_feature((mask_addr), (bit) - 1)) + (bit) = find_next_netdev_feature((mask_addr), (bit))) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -- cgit v1.2.3 From bb17d110cbf270d5247a6e261c5ad50e362d1675 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 29 Apr 2022 21:59:45 +0200 Subject: rpmsg: Fix calling device_lock() on non-initialized device driver_set_override() helper uses device_lock() so it should not be called before rpmsg_register_device() (which calls device_register()). Effect can be seen with CONFIG_DEBUG_MUTEXES: DEBUG_LOCKS_WARN_ON(lock->magic != lock) WARNING: CPU: 3 PID: 57 at kernel/locking/mutex.c:582 __mutex_lock+0x1ec/0x430 ... Call trace: __mutex_lock+0x1ec/0x430 mutex_lock_nested+0x44/0x50 driver_set_override+0x124/0x150 qcom_glink_native_probe+0x30c/0x3b0 glink_rpm_probe+0x274/0x350 platform_probe+0x6c/0xe0 really_probe+0x17c/0x3d0 __driver_probe_device+0x114/0x190 driver_probe_device+0x3c/0xf0 ... Refactor the rpmsg_register_device() function to use two-step device registering (initialization + add) and call driver_set_override() in proper moment. This moves the code around, so while at it also NULL-ify the rpdev->driver_override in error path to be sure it won't be kfree() second time. Fixes: 42cd402b8fd4 ("rpmsg: Fix kfree() of static memory on setting driver_override") Reported-by: Marek Szyprowski Signed-off-by: Krzysztof Kozlowski Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20220429195946.1061725-2-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/rpmsg.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 20c8cd1cde21..523c98b96cb4 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -165,6 +165,8 @@ static inline __rpmsg64 cpu_to_rpmsg64(struct rpmsg_device *rpdev, u64 val) #if IS_ENABLED(CONFIG_RPMSG) +int rpmsg_register_device_override(struct rpmsg_device *rpdev, + const char *driver_override); int rpmsg_register_device(struct rpmsg_device *rpdev); int rpmsg_unregister_device(struct device *parent, struct rpmsg_channel_info *chinfo); @@ -192,6 +194,12 @@ ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept); #else +static inline int rpmsg_register_device_override(struct rpmsg_device *rpdev, + const char *driver_override) +{ + return -ENXIO; +} + static inline int rpmsg_register_device(struct rpmsg_device *rpdev) { return -ENXIO; -- cgit v1.2.3 From d143b9db8069f0e2a0fa34484e806a55a0dd4855 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Apr 2022 11:04:42 +0200 Subject: export: fix string handling of namespace in EXPORT_SYMBOL_NS Commit c3a6cf19e695 ("export: avoid code duplication in include/linux/export.h") broke the ability for a defined string to be used as a namespace value. Fix this up by using stringify to properly encode the namespace name. Fixes: c3a6cf19e695 ("export: avoid code duplication in include/linux/export.h") Cc: Miroslav Benes Cc: Emil Velikov Cc: Jessica Yu Cc: Quentin Perret Cc: Matthias Maennich Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20220427090442.2105905-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/export.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 27d848712b90..5910ccb66ca2 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -2,6 +2,8 @@ #ifndef _LINUX_EXPORT_H #define _LINUX_EXPORT_H +#include + /* * Export symbols from the kernel to modules. Forked from module.h * to reduce the amount of pointless cruft we feed to gcc when only @@ -154,7 +156,6 @@ struct kernel_symbol { #endif /* CONFIG_MODULES */ #ifdef DEFAULT_SYMBOL_NAMESPACE -#include #define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, __stringify(DEFAULT_SYMBOL_NAMESPACE)) #else #define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, "") @@ -162,8 +163,8 @@ struct kernel_symbol { #define EXPORT_SYMBOL(sym) _EXPORT_SYMBOL(sym, "") #define EXPORT_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "_gpl") -#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", #ns) -#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "_gpl", #ns) +#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", __stringify(ns)) +#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "_gpl", __stringify(ns)) #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 6de4d4eca9a2d0195f802bc97b0e9aeeaff05900 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 28 Apr 2022 02:24:27 -0400 Subject: platform/x86: pmc_atom: remove unused pmc_atom_write() This function isn't used anywhere in the driver or anywhere in tree. So remove it. It can always be re-added if/when a use arises. Cc: Andy Shevchenko Cc: Aubrey Li Cc: Hans de Goede Cc: Mark Gross Cc: platform-driver-x86@vger.kernel.org Signed-off-by: Paul Gortmaker Link: https://lore.kernel.org/r/20220428062430.31010-2-paul.gortmaker@windriver.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/pmc_atom.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 022bcea9edec..6807839c718b 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -144,6 +144,5 @@ #define SLEEP_ENABLE 0x2000 extern int pmc_atom_read(int offset, u32 *value); -extern int pmc_atom_write(int offset, u32 value); #endif /* PMC_ATOM_H */ -- cgit v1.2.3 From 6df6398f7c8b481ce83f28143bc08a5231616deb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:31 -0700 Subject: net: add netif_inherit_tso_max() To make later patches smaller create a helper for inheriting the TSO limitations of a lower device. The TSO in the name is not an accident, subsequent patches will replace GSO with TSO in more names. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf66e57d891..006bb5c0e413 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4895,6 +4895,9 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_inherit_tso_max(struct net_device *to, + const struct net_device *from); + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) -- cgit v1.2.3 From 14d7b8122fd591693a2388b98563707ba72c6780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:32 -0700 Subject: net: don't allow user space to lift the device limits Up until commit 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") the gso_max_segs and gso_max_size of a device were not controlled from user space. The quoted commit added the ability to control them because of the following setup: netns A | netns B veth<->veth eth0 If eth0 has TSO limitations and user wants to efficiently forward traffic between eth0 and the veths they should copy the TSO limitations of eth0 onto the veths. This would happen automatically for macvlans or ipvlan but veth users are not so lucky (given the loose coupling). Unfortunately the commit in question allowed users to also override the limits on real HW devices. It may be useful to control the max GSO size and someone may be using that ability (not that I know of any user), so create a separate set of knobs to reliably record the TSO limitations. Validate the user requests. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 006bb5c0e413..e12f7de6d6ae 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,8 +1917,10 @@ enum netdev_ml_priv_type { * @rtnl_link_ops: Rtnl_link_ops * * @gso_max_size: Maximum size of generic segmentation offload + * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2262,8 +2264,13 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define TSO_LEGACY_MAX_SIZE 65536 +#define TSO_MAX_SIZE UINT_MAX + unsigned int tso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; +#define TSO_MAX_SEGS U16_MAX + u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; @@ -4895,6 +4902,8 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_set_tso_max_size(struct net_device *dev, unsigned int size); +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); -- cgit v1.2.3 From 744d49daf8bd3b17b345c836f2e6f97d49fa6ae8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:34 -0700 Subject: net: move netif_set_gso_max helpers These are now internal to the core, no need to expose them. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e12f7de6d6ae..8cf0ac616cb9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4881,27 +4881,6 @@ static inline bool netif_needs_gso(struct sk_buff *skb, (skb->ip_summed != CHECKSUM_UNNECESSARY))); } -static inline void netif_set_gso_max_size(struct net_device *dev, - unsigned int size) -{ - /* dev->gso_max_size is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_size, size); -} - -static inline void netif_set_gso_max_segs(struct net_device *dev, - unsigned int segs) -{ - /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_segs, segs); -} - -static inline void netif_set_gro_max_size(struct net_device *dev, - unsigned int size) -{ - /* This pairs with the READ_ONCE() in skb_gro_receive() */ - WRITE_ONCE(dev->gro_max_size, size); -} - void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, -- cgit v1.2.3 From 3ff5f7840979aa36d47a6a00694826c78d63bf3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 May 2022 14:14:36 +0200 Subject: linkage: Fix issue with missing symbol size Occasionally, typically when a function doesn't end with 'ret', an alias on that function will have 0 size. The difference between what GCC generates and our linkage magic, is that GCC doesn't appear to provide .size for the alias'ed symbol at all. And indeed, removing this directive cures the issue. Additionally, GCC also doesn't emit .type for alias symbols either, so also omit that. Fixes: e0891269a8c2 ("linkage: add SYM_FUNC_ALIAS{,_LOCAL,_WEAK}()") Suggested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Mark Rutland Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220506121631.437480085@infradead.org --- include/linux/linkage.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index acb1ad2356f1..1feab6136b5b 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -171,12 +171,9 @@ /* SYM_ALIAS -- use only if you have to */ #ifndef SYM_ALIAS -#define SYM_ALIAS(alias, name, sym_type, linkage) \ - linkage(alias) ASM_NL \ - .set alias, name ASM_NL \ - .type alias sym_type ASM_NL \ - .set .L__sym_size_##alias, .L__sym_size_##name ASM_NL \ - .size alias, .L__sym_size_##alias +#define SYM_ALIAS(alias, name, linkage) \ + linkage(alias) ASM_NL \ + .set alias, name ASM_NL #endif /* === code annotations === */ @@ -261,7 +258,7 @@ */ #ifndef SYM_FUNC_ALIAS #define SYM_FUNC_ALIAS(alias, name) \ - SYM_ALIAS(alias, name, SYM_T_FUNC, SYM_L_GLOBAL) + SYM_ALIAS(alias, name, SYM_L_GLOBAL) #endif /* @@ -269,7 +266,7 @@ */ #ifndef SYM_FUNC_ALIAS_LOCAL #define SYM_FUNC_ALIAS_LOCAL(alias, name) \ - SYM_ALIAS(alias, name, SYM_T_FUNC, SYM_L_LOCAL) + SYM_ALIAS(alias, name, SYM_L_LOCAL) #endif /* @@ -277,7 +274,7 @@ */ #ifndef SYM_FUNC_ALIAS_WEAK #define SYM_FUNC_ALIAS_WEAK(alias, name) \ - SYM_ALIAS(alias, name, SYM_T_FUNC, SYM_L_WEAK) + SYM_ALIAS(alias, name, SYM_L_WEAK) #endif /* SYM_CODE_START -- use for non-C (special) functions */ -- cgit v1.2.3 From 6b79738b6ed91a2d0fe958819469eeedac3bca81 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Fri, 15 Apr 2022 18:23:52 +0800 Subject: drivers/perf: hisi: Add Support for CPA PMU On HiSilicon Hip09 platform, there is a CPA (Coherency Protocol Agent) on each SICL (Super IO Cluster) which implements packet format translation, route parsing and traffic statistics. CPA PMU has 8 PMU counters and interrupt is supported to handle counter overflow. Let's support its driver under the framework of HiSilicon PMU driver. Signed-off-by: Qi Liu Reviewed-by: John Garry Reviewed-by: Shaokun Zhang Link: https://lore.kernel.org/r/20220415102352.6665-3-liuqi115@huawei.com Signed-off-by: Will Deacon --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 82e33137f917..b66c5f389159 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -222,6 +222,7 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, -- cgit v1.2.3 From 343f4c49f2438d8920f1f76fa823ee59b91f02e4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Apr 2022 11:40:14 -0500 Subject: kthread: Don't allocate kthread_struct for init and umh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If kthread_is_per_cpu runs concurrently with free_kthread_struct the kthread_struct that was just freed may be read from. This bug was introduced by commit 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads"). When kthread_struct started to be allocated for all tasks that have PF_KTHREAD set. This in turn required the kthread_struct to be freed in kernel_execve and violated the assumption that kthread_struct will have the same lifetime as the task. Looking a bit deeper this only applies to callers of kernel_execve which is just the init process and the user mode helper processes. These processes really don't want to be kernel threads but are for historical reasons. Mostly that copy_thread does not know how to take a kernel mode function to the process with for processes without PF_KTHREAD or PF_IO_WORKER set. Solve this by not allocating kthread_struct for the init process and the user mode helper processes. This is done by adding a kthread member to struct kernel_clone_args. Setting kthread in fork_idle and kernel_thread. Adding user_mode_thread that works like kernel_thread except it does not set kthread. In fork only allocating the kthread_struct if .kthread is set. I have looked at kernel/kthread.c and since commit 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") there have been no assumptions added that to_kthread or __to_kthread will not return NULL. There are a few callers of to_kthread or __to_kthread that assume a non-NULL struct kthread pointer will be returned. These functions are kthread_data(), kthread_parmme(), kthread_exit(), kthread(), kthread_park(), kthread_unpark(), kthread_stop(). All of those functions can reasonably expected to be called when it is know that a task is a kthread so that assumption seems reasonable. Cc: stable@vger.kernel.org Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") Reported-by: Максим Кутявин Link: https://lkml.kernel.org/r/20220506141512.516114-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/task.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 719c9a6cac8d..4492266935dd 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -32,6 +32,7 @@ struct kernel_clone_args { size_t set_tid_size; int cgroup; int io_thread; + int kthread; struct cgroup *cgrp; struct css_set *cset; }; @@ -89,6 +90,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); -- cgit v1.2.3 From e2ef115813c34ea5380ac5b4879f515070150210 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 May 2022 14:14:37 +0200 Subject: objtool: Fix STACK_FRAME_NON_STANDARD reloc type STACK_FRAME_NON_STANDARD results in inconsistent relocation types depending on .c or .S usage: Relocation section '.rela.discard.func_stack_frame_non_standard' at offset 0x3c01090 contains 5 entries: Offset Info Type Symbol's Value Symbol's Name + Addend 0000000000000000 00020c2200000002 R_X86_64_PC32 0000000000047b40 do_suspend_lowlevel + 0 0000000000000008 0002461e00000001 R_X86_64_64 00000000000480a0 machine_real_restart + 0 0000000000000010 0000001400000001 R_X86_64_64 0000000000000000 .rodata + b3d4 0000000000000018 0002444600000002 R_X86_64_PC32 00000000000678a0 __efi64_thunk + 0 0000000000000020 0002659d00000001 R_X86_64_64 0000000000113160 __crash_kexec + 0 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.508692613@infradead.org --- include/linux/objtool.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 586d35720f13..b9c1474a571e 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -40,6 +40,8 @@ struct unwind_hint { #ifdef CONFIG_STACK_VALIDATION +#include + #ifndef __ASSEMBLY__ #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ @@ -137,7 +139,7 @@ struct unwind_hint { .macro STACK_FRAME_NON_STANDARD func:req .pushsection .discard.func_stack_frame_non_standard, "aw" - .long \func - . + _ASM_PTR \func .popsection .endm -- cgit v1.2.3 From c5febea0956fd3874e8fb59c6f84d68f128d68f8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Apr 2022 18:07:50 -0500 Subject: fork: Pass struct kernel_clone_args into copy_thread With io_uring we have started supporting tasks that are for most purposes user space tasks that exclusively run code in kernel mode. The kernel task that exec's init and tasks that exec user mode helpers are also user mode tasks that just run kernel code until they call kernel execve. Pass kernel_clone_args into copy_thread so these oddball tasks can be supported more cleanly and easily. v2: Fix spelling of kenrel_clone_args on h8300 Link: https://lkml.kernel.org/r/20220506141512.516114-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/task.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4492266935dd..fcdcba231aac 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -68,8 +68,7 @@ extern void fork_init(void); extern void release_task(struct task_struct * p); -extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *, unsigned long); +extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); extern void flush_thread(void); -- cgit v1.2.3 From 36cb0e1cda645ee645b85a6ce652cb46a16e14e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 11 Apr 2022 16:17:28 -0500 Subject: fork: Explicity test for idle tasks in copy_thread The architectures ia64 and parisc have special handling for the idle thread in copy_process. Add a flag named idle to kernel_clone_args and use it to explicity test if an idle process is being created. Fullfill the expectations of the rest of the copy_thread implemetations and pass a function pointer in .stack from fork_idle(). This makes what is happening in copy_thread better defined, and is useful to make idle threads less special. Link: https://lkml.kernel.org/r/20220506141512.516114-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index fcdcba231aac..3d6b99ce5408 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -33,6 +33,7 @@ struct kernel_clone_args { int cgroup; int io_thread; int kthread; + int idle; struct cgroup *cgrp; struct css_set *cset; }; -- cgit v1.2.3 From 5bd2e97c868a8a44470950ed01846cab6328e540 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Apr 2022 10:18:48 -0500 Subject: fork: Generalize PF_IO_WORKER handling Add fn and fn_arg members into struct kernel_clone_args and test for them in copy_thread (instead of testing for PF_KTHREAD | PF_IO_WORKER). This allows any task that wants to be a user space task that only runs in kernel mode to use this functionality. The code on x86 is an exception and still retains a PF_KTHREAD test because x86 unlikely everything else handles kthreads slightly differently than user space tasks that start with a function. The functions that created tasks that start with a function have been updated to set ".fn" and ".fn_arg" instead of ".stack" and ".stack_size". These functions are fork_idle(), create_io_thread(), kernel_thread(), and user_mode_thread(). Link: https://lkml.kernel.org/r/20220506141512.516114-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched/task.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d6b99ce5408..505aaf9fe477 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -34,6 +34,8 @@ struct kernel_clone_args { int io_thread; int kthread; int idle; + int (*fn)(void *); + void *fn_arg; struct cgroup *cgrp; struct css_set *cset; }; -- cgit v1.2.3 From 3d1b0d351441c2be7de082b92f43d0bfdc95a8f3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 May 2022 13:48:21 -0400 Subject: Revert "SUNRPC: Ensure gss-proxy connects on setup" This reverts commit 892de36fd4a98fab3298d417c051d9099af5448d. The gssproxy server is unresponsive when it calls into the kernel to start the upcall service, so it will not reply to our RPC ping at all. Reported-by: "J.Bruce Fields" Fixes: 892de36fd4a9 ("SUNRPC: Ensure gss-proxy connects on setup") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index db5149567305..267b7aeaf1a6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,7 +160,6 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) -#define RPC_CLNT_CREATE_IGNORE_NULL_UNAVAIL (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, -- cgit v1.2.3 From fd13359f54ee854f00134abc6be32da94ec53dbf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 7 May 2022 13:53:59 -0400 Subject: SUNRPC: Ensure that the gssproxy client can start in a connected state Ensure that the gssproxy client connects to the server from the gssproxy daemon process context so that the AF_LOCAL socket connection is done using the correct path and namespaces. Fixes: 1d658336b05f ("SUNRPC: Add RPC based upcall mechanism for RPCGSS auth") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 267b7aeaf1a6..90501404fa49 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) +#define RPC_CLNT_CREATE_CONNECTED (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, -- cgit v1.2.3 From 813c2aee51dd7d7d9092251851e33f66719513cc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 7 May 2022 14:33:31 +0200 Subject: ARM/pxa/mfd/power/sound: Switch Tosa to GPIO descriptors The Tosa device (Sharp SL-6000) has a mishmash driver set-up for the Toshiba TC6393xb MFD that includes a battery charger and touchscreen and has some kind of relationship to the SoC sound driver for the AC97 codec. Other devices define a chip like this but seem only half-implemented, not really handling battery charging etc. This patch switches the Toshiba MFD device to provide GPIO descriptors to the battery charger and SoC codec. As a result some descriptors need to be moved out of the Tosa boardfile and new one added: all SoC GPIO resources to these drivers now comes from the main boardfile, while the MFD provide GPIOs for its portions. As a result we can request one GPIO from our own GPIO chip and drop two hairy callbacks into the board file. This platform badly needs to have its drivers split up and converted to device tree probing to handle this quite complex relationship in an orderly manner. I just do my best in solving the GPIO descriptor part of the puzzle. Please don't ask me to fix everything that is wrong with these driver to todays standards, I am just trying to fix one aspect. I do try to use modern devres resource management and handle deferred probe using new functions where appropriate. Cc: Dmitry Eremin-Solenikov Cc: Dirk Opfer Cc: Robert Jarzmik Cc: Daniel Mack Cc: Haojian Zhuang Cc: Lee Jones Cc: Liam Girdwood Reviewed-by: Dmitry Baryshkov Acked-by: Mark Brown Acked-by: Sebastian Reichel Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/gpio/machine.h | 12 ++++++++++++ include/linux/mfd/tc6393xb.h | 3 --- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 2647dd10b541..4d55da28e664 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -63,6 +63,18 @@ struct gpiod_hog { int dflags; }; +/* + * Helper for lookup tables with just one single lookup for a device. + */ +#define GPIO_LOOKUP_SINGLE(_name, _dev_id, _key, _chip_hwnum, _con_id, _flags) \ +static struct gpiod_lookup_table _name = { \ + .dev_id = _dev_id, \ + .table = { \ + GPIO_LOOKUP(_key, _chip_hwnum, _con_id, _flags), \ + {}, \ + }, \ +} + /* * Simple definition of a single GPIO under a con_id */ diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h index fcc8e74f0e8d..d336c541b7df 100644 --- a/include/linux/mfd/tc6393xb.h +++ b/include/linux/mfd/tc6393xb.h @@ -27,9 +27,6 @@ struct tc6393xb_platform_data { int (*resume)(struct platform_device *dev); int irq_base; /* base for subdevice irqs */ - int gpio_base; - int (*setup)(struct platform_device *dev); - void (*teardown)(struct platform_device *dev); struct tmio_nand_data *nand_data; struct tmio_fb_data *fb_data; -- cgit v1.2.3 From ac70f4d80df414223130b04d9b4435bf56dda654 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Sep 2019 10:58:26 +0200 Subject: ARM: pxa: poodle: use platform data for poodle asoc driver The poodle audio driver shows its age by using a custom gpio api for the "locomo" support chip. In a perfect world, this would get converted to use gpiolib and a gpio lookup table. As the world is not perfect, just pass all the required data in a custom platform_data structure. to avoid the globally visible mach/poodle.h header. Acked-by: Mark Brown Acked-by: Robert Jarzmik Cc: alsa-devel@alsa-project.org Signed-off-by: Arnd Bergmann --- include/linux/platform_data/asoc-poodle.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/platform_data/asoc-poodle.h (limited to 'include/linux') diff --git a/include/linux/platform_data/asoc-poodle.h b/include/linux/platform_data/asoc-poodle.h new file mode 100644 index 000000000000..2052fad55c5c --- /dev/null +++ b/include/linux/platform_data/asoc-poodle.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PLATFORM_DATA_POODLE_AUDIO +#define __LINUX_PLATFORM_DATA_POODLE_AUDIO + +/* locomo is not a proper gpio driver, and uses its own api */ +struct poodle_audio_platform_data { + struct device *locomo_dev; + + int gpio_amp_on; + int gpio_mute_l; + int gpio_mute_r; + int gpio_232vcc_on; + int gpio_jk_b; +}; + +#endif -- cgit v1.2.3 From a2ef926143b84664f4823c82a437b29c4667a08c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 18 Oct 2019 13:48:33 -0700 Subject: Input: wm97xx - switch to using threaded IRQ Instead of manually disabling and enabling interrupts and scheduling work to access the device, let's use threaded oneshot interrupt handler. It simplifies things. Signed-off-by: Dmitry Torokhov Signed-off-by: Arnd Bergmann --- include/linux/wm97xx.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h index 462854f4f286..85bd8dd3caea 100644 --- a/include/linux/wm97xx.h +++ b/include/linux/wm97xx.h @@ -281,7 +281,6 @@ struct wm97xx { unsigned long ts_reader_min_interval; /* Minimum interval */ unsigned int pen_irq; /* Pen IRQ number in use */ struct workqueue_struct *ts_workq; - struct work_struct pen_event_work; u16 acc_slot; /* AC97 slot used for acc touch data */ u16 acc_rate; /* acc touch data rate */ unsigned pen_is_down:1; /* Pen is down */ -- cgit v1.2.3 From e1d8f31218aab302df9e18526345af189fd061f0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 18 Oct 2019 13:48:34 -0700 Subject: Input: wm97xx - get rid of irq_enable method in wm97xx_mach_ops Now that we are using oneshot threaded IRQ this method is not used anymore. Signed-off-by: Dmitry Torokhov [arnd: add the db1300 change as well] Cc: Manuel Lauss Signed-off-by: Arnd Bergmann --- include/linux/wm97xx.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wm97xx.h b/include/linux/wm97xx.h index 85bd8dd3caea..332d2b0f29b9 100644 --- a/include/linux/wm97xx.h +++ b/include/linux/wm97xx.h @@ -254,9 +254,6 @@ struct wm97xx_mach_ops { int (*acc_startup) (struct wm97xx *); void (*acc_shutdown) (struct wm97xx *); - /* interrupt mask control - required for accelerated operation */ - void (*irq_enable) (struct wm97xx *, int enable); - /* GPIO pin used for accelerated operation */ int irq_gpio; -- cgit v1.2.3 From 6a946f1bd5cc966c9dd377b14efa4cec388101ce Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 14:50:37 +0200 Subject: ARM: pxa: pcmcia: move smemc configuration back to arch Rather than poking at the smemc registers directly from the pcmcia/pxa2xx_base driver, move those bits into machine file to have a cleaner interface. Cc: Dominik Brodowski Link: https://lore.kernel.org/lkml/87d0egjzxk.fsf@belgarion.home/ Signed-off-by: Arnd Bergmann --- include/linux/soc/pxa/smemc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 include/linux/soc/pxa/smemc.h (limited to 'include/linux') diff --git a/include/linux/soc/pxa/smemc.h b/include/linux/soc/pxa/smemc.h new file mode 100644 index 000000000000..cbf1a2d8af29 --- /dev/null +++ b/include/linux/soc/pxa/smemc.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __PXA_REGS_H +#define __PXA_REGS_H + +#include + +void pxa_smemc_set_pcmcia_timing(int sock, u32 mcmem, u32 mcatt, u32 mcio); +void pxa_smemc_set_pcmcia_socket(int nr); + +#endif -- cgit v1.2.3 From 5c6603e741921c75f67a724e798642ed50a6328e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 15:34:10 +0200 Subject: cpufreq: pxa3: move clk register access to clk driver The driver needs some low-level register access for setting the core and bus frequencies. These registers are owned by the clk driver, so move the low-level access into that driver with a slightly higher-level interface and avoid any machine header file dependencies. Cc: Michael Turquette Cc: Stephen Boyd Acked-by: Viresh Kumar Cc: linux-clk@vger.kernel.org Cc: linux-pm@vger.kernel.org Signed-off-by: Arnd Bergmann --- include/linux/clk/pxa.h | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 include/linux/clk/pxa.h (limited to 'include/linux') diff --git a/include/linux/clk/pxa.h b/include/linux/clk/pxa.h new file mode 100644 index 000000000000..e5516c608c99 --- /dev/null +++ b/include/linux/clk/pxa.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifdef CONFIG_PXA3xx +extern unsigned pxa3xx_get_clk_frequency_khz(int); +extern void pxa3xx_clk_update_accr(u32 disable, u32 enable, u32 xclkcfg, u32 mask); +#else +#define pxa3xx_get_clk_frequency_khz(x) (0) +#define pxa3xx_clk_update_accr(disable, enable, xclkcfg, mask) do { } while (0) +#endif -- cgit v1.2.3 From fd13f8117f7a2d4054bf420ec1428e918a24a480 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 17:42:52 +0200 Subject: ARM: pxa: move smemc register access from clk to platform The get_sdram_rows() and get_memclkdiv() helpers need smemc register that are separate from the clk registers, move them out of the clk driver, and use an extern declaration instead. Cc: Michael Turquette Cc: Stephen Boyd Cc: linux-clk@vger.kernel.org Link: https://lore.kernel.org/lkml/87pnielzo4.fsf@belgarion.home/ Signed-off-by: Arnd Bergmann --- include/linux/soc/pxa/smemc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/pxa/smemc.h b/include/linux/soc/pxa/smemc.h index cbf1a2d8af29..f1ffea236c15 100644 --- a/include/linux/soc/pxa/smemc.h +++ b/include/linux/soc/pxa/smemc.h @@ -6,5 +6,8 @@ void pxa_smemc_set_pcmcia_timing(int sock, u32 mcmem, u32 mcatt, u32 mcio); void pxa_smemc_set_pcmcia_socket(int nr); +int pxa2xx_smemc_get_sdram_rows(void); +unsigned int pxa3xx_smemc_get_memclkdiv(void); +void __iomem *pxa_smemc_get_mdrefr(void); #endif -- cgit v1.2.3 From 3c816d950a494ae6e16b1fa017af29bc53cb7791 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 20:54:17 +0200 Subject: ARM: pxa: move clk register definitions to driver The clock register definitions are now used (almost) exclusively in the clk driver, and that relies on no other mach/*.h header files any more. Remove the dependency on mach/pxa*-regs.h by addressing the registers as offsets from a void __iomem * pointer, which is either passed from a board file, or (for the moment) ioremapped at boot time from a hardcoded address in case of DT (this should be moved into the DT of course). Cc: linux-clk@vger.kernel.org Acked-by: Stephen Boyd Acked-by: Robert Jarzmik Signed-off-by: Arnd Bergmann --- include/linux/clk/pxa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/pxa.h b/include/linux/clk/pxa.h index e5516c608c99..736b8bb91bd7 100644 --- a/include/linux/clk/pxa.h +++ b/include/linux/clk/pxa.h @@ -1,5 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#include +#include + +extern int pxa25x_clocks_init(void __iomem *regs); +extern int pxa27x_clocks_init(void __iomem *regs); +extern int pxa3xx_clocks_init(void __iomem *regs, void __iomem *oscc_reg); + #ifdef CONFIG_PXA3xx extern unsigned pxa3xx_get_clk_frequency_khz(int); extern void pxa3xx_clk_update_accr(u32 disable, u32 enable, u32 xclkcfg, u32 mask); -- cgit v1.2.3 From 64dbc4dd7a7cc6642c522963a6194b62480e2a68 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Sep 2019 13:33:40 +0200 Subject: ARM: pxa: move plat-pxa to drivers/soc/ There are two drivers in arch/arm/plat-pxa: mfp and ssp. Both of them should ideally not be needed at all, as there are proper subsystems to replace them. OTOH, they are self-contained and can simply be normal SoC drivers, so move them over there to eliminate one more of the plat-* directories. Acked-by: Robert Jarzmik (mach-pxa) Acked-by: Lubomir Rintel (mach-mmp) Signed-off-by: Arnd Bergmann --- include/linux/soc/pxa/mfp.h | 470 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 include/linux/soc/pxa/mfp.h (limited to 'include/linux') diff --git a/include/linux/soc/pxa/mfp.h b/include/linux/soc/pxa/mfp.h new file mode 100644 index 000000000000..39779cbed0c0 --- /dev/null +++ b/include/linux/soc/pxa/mfp.h @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common Multi-Function Pin Definitions + * + * Copyright (C) 2007 Marvell International Ltd. + * + * 2007-8-21: eric miao + * initial version + */ + +#ifndef __ASM_PLAT_MFP_H +#define __ASM_PLAT_MFP_H + +#define mfp_to_gpio(m) ((m) % 256) + +/* list of all the configurable MFP pins */ +enum { + MFP_PIN_INVALID = -1, + + MFP_PIN_GPIO0 = 0, + MFP_PIN_GPIO1, + MFP_PIN_GPIO2, + MFP_PIN_GPIO3, + MFP_PIN_GPIO4, + MFP_PIN_GPIO5, + MFP_PIN_GPIO6, + MFP_PIN_GPIO7, + MFP_PIN_GPIO8, + MFP_PIN_GPIO9, + MFP_PIN_GPIO10, + MFP_PIN_GPIO11, + MFP_PIN_GPIO12, + MFP_PIN_GPIO13, + MFP_PIN_GPIO14, + MFP_PIN_GPIO15, + MFP_PIN_GPIO16, + MFP_PIN_GPIO17, + MFP_PIN_GPIO18, + MFP_PIN_GPIO19, + MFP_PIN_GPIO20, + MFP_PIN_GPIO21, + MFP_PIN_GPIO22, + MFP_PIN_GPIO23, + MFP_PIN_GPIO24, + MFP_PIN_GPIO25, + MFP_PIN_GPIO26, + MFP_PIN_GPIO27, + MFP_PIN_GPIO28, + MFP_PIN_GPIO29, + MFP_PIN_GPIO30, + MFP_PIN_GPIO31, + MFP_PIN_GPIO32, + MFP_PIN_GPIO33, + MFP_PIN_GPIO34, + MFP_PIN_GPIO35, + MFP_PIN_GPIO36, + MFP_PIN_GPIO37, + MFP_PIN_GPIO38, + MFP_PIN_GPIO39, + MFP_PIN_GPIO40, + MFP_PIN_GPIO41, + MFP_PIN_GPIO42, + MFP_PIN_GPIO43, + MFP_PIN_GPIO44, + MFP_PIN_GPIO45, + MFP_PIN_GPIO46, + MFP_PIN_GPIO47, + MFP_PIN_GPIO48, + MFP_PIN_GPIO49, + MFP_PIN_GPIO50, + MFP_PIN_GPIO51, + MFP_PIN_GPIO52, + MFP_PIN_GPIO53, + MFP_PIN_GPIO54, + MFP_PIN_GPIO55, + MFP_PIN_GPIO56, + MFP_PIN_GPIO57, + MFP_PIN_GPIO58, + MFP_PIN_GPIO59, + MFP_PIN_GPIO60, + MFP_PIN_GPIO61, + MFP_PIN_GPIO62, + MFP_PIN_GPIO63, + MFP_PIN_GPIO64, + MFP_PIN_GPIO65, + MFP_PIN_GPIO66, + MFP_PIN_GPIO67, + MFP_PIN_GPIO68, + MFP_PIN_GPIO69, + MFP_PIN_GPIO70, + MFP_PIN_GPIO71, + MFP_PIN_GPIO72, + MFP_PIN_GPIO73, + MFP_PIN_GPIO74, + MFP_PIN_GPIO75, + MFP_PIN_GPIO76, + MFP_PIN_GPIO77, + MFP_PIN_GPIO78, + MFP_PIN_GPIO79, + MFP_PIN_GPIO80, + MFP_PIN_GPIO81, + MFP_PIN_GPIO82, + MFP_PIN_GPIO83, + MFP_PIN_GPIO84, + MFP_PIN_GPIO85, + MFP_PIN_GPIO86, + MFP_PIN_GPIO87, + MFP_PIN_GPIO88, + MFP_PIN_GPIO89, + MFP_PIN_GPIO90, + MFP_PIN_GPIO91, + MFP_PIN_GPIO92, + MFP_PIN_GPIO93, + MFP_PIN_GPIO94, + MFP_PIN_GPIO95, + MFP_PIN_GPIO96, + MFP_PIN_GPIO97, + MFP_PIN_GPIO98, + MFP_PIN_GPIO99, + MFP_PIN_GPIO100, + MFP_PIN_GPIO101, + MFP_PIN_GPIO102, + MFP_PIN_GPIO103, + MFP_PIN_GPIO104, + MFP_PIN_GPIO105, + MFP_PIN_GPIO106, + MFP_PIN_GPIO107, + MFP_PIN_GPIO108, + MFP_PIN_GPIO109, + MFP_PIN_GPIO110, + MFP_PIN_GPIO111, + MFP_PIN_GPIO112, + MFP_PIN_GPIO113, + MFP_PIN_GPIO114, + MFP_PIN_GPIO115, + MFP_PIN_GPIO116, + MFP_PIN_GPIO117, + MFP_PIN_GPIO118, + MFP_PIN_GPIO119, + MFP_PIN_GPIO120, + MFP_PIN_GPIO121, + MFP_PIN_GPIO122, + MFP_PIN_GPIO123, + MFP_PIN_GPIO124, + MFP_PIN_GPIO125, + MFP_PIN_GPIO126, + MFP_PIN_GPIO127, + + MFP_PIN_GPIO128, + MFP_PIN_GPIO129, + MFP_PIN_GPIO130, + MFP_PIN_GPIO131, + MFP_PIN_GPIO132, + MFP_PIN_GPIO133, + MFP_PIN_GPIO134, + MFP_PIN_GPIO135, + MFP_PIN_GPIO136, + MFP_PIN_GPIO137, + MFP_PIN_GPIO138, + MFP_PIN_GPIO139, + MFP_PIN_GPIO140, + MFP_PIN_GPIO141, + MFP_PIN_GPIO142, + MFP_PIN_GPIO143, + MFP_PIN_GPIO144, + MFP_PIN_GPIO145, + MFP_PIN_GPIO146, + MFP_PIN_GPIO147, + MFP_PIN_GPIO148, + MFP_PIN_GPIO149, + MFP_PIN_GPIO150, + MFP_PIN_GPIO151, + MFP_PIN_GPIO152, + MFP_PIN_GPIO153, + MFP_PIN_GPIO154, + MFP_PIN_GPIO155, + MFP_PIN_GPIO156, + MFP_PIN_GPIO157, + MFP_PIN_GPIO158, + MFP_PIN_GPIO159, + MFP_PIN_GPIO160, + MFP_PIN_GPIO161, + MFP_PIN_GPIO162, + MFP_PIN_GPIO163, + MFP_PIN_GPIO164, + MFP_PIN_GPIO165, + MFP_PIN_GPIO166, + MFP_PIN_GPIO167, + MFP_PIN_GPIO168, + MFP_PIN_GPIO169, + MFP_PIN_GPIO170, + MFP_PIN_GPIO171, + MFP_PIN_GPIO172, + MFP_PIN_GPIO173, + MFP_PIN_GPIO174, + MFP_PIN_GPIO175, + MFP_PIN_GPIO176, + MFP_PIN_GPIO177, + MFP_PIN_GPIO178, + MFP_PIN_GPIO179, + MFP_PIN_GPIO180, + MFP_PIN_GPIO181, + MFP_PIN_GPIO182, + MFP_PIN_GPIO183, + MFP_PIN_GPIO184, + MFP_PIN_GPIO185, + MFP_PIN_GPIO186, + MFP_PIN_GPIO187, + MFP_PIN_GPIO188, + MFP_PIN_GPIO189, + MFP_PIN_GPIO190, + MFP_PIN_GPIO191, + + MFP_PIN_GPIO255 = 255, + + MFP_PIN_GPIO0_2, + MFP_PIN_GPIO1_2, + MFP_PIN_GPIO2_2, + MFP_PIN_GPIO3_2, + MFP_PIN_GPIO4_2, + MFP_PIN_GPIO5_2, + MFP_PIN_GPIO6_2, + MFP_PIN_GPIO7_2, + MFP_PIN_GPIO8_2, + MFP_PIN_GPIO9_2, + MFP_PIN_GPIO10_2, + MFP_PIN_GPIO11_2, + MFP_PIN_GPIO12_2, + MFP_PIN_GPIO13_2, + MFP_PIN_GPIO14_2, + MFP_PIN_GPIO15_2, + MFP_PIN_GPIO16_2, + MFP_PIN_GPIO17_2, + + MFP_PIN_ULPI_STP, + MFP_PIN_ULPI_NXT, + MFP_PIN_ULPI_DIR, + + MFP_PIN_nXCVREN, + MFP_PIN_DF_CLE_nOE, + MFP_PIN_DF_nADV1_ALE, + MFP_PIN_DF_SCLK_E, + MFP_PIN_DF_SCLK_S, + MFP_PIN_nBE0, + MFP_PIN_nBE1, + MFP_PIN_DF_nADV2_ALE, + MFP_PIN_DF_INT_RnB, + MFP_PIN_DF_nCS0, + MFP_PIN_DF_nCS1, + MFP_PIN_nLUA, + MFP_PIN_nLLA, + MFP_PIN_DF_nWE, + MFP_PIN_DF_ALE_nWE, + MFP_PIN_DF_nRE_nOE, + MFP_PIN_DF_ADDR0, + MFP_PIN_DF_ADDR1, + MFP_PIN_DF_ADDR2, + MFP_PIN_DF_ADDR3, + MFP_PIN_DF_IO0, + MFP_PIN_DF_IO1, + MFP_PIN_DF_IO2, + MFP_PIN_DF_IO3, + MFP_PIN_DF_IO4, + MFP_PIN_DF_IO5, + MFP_PIN_DF_IO6, + MFP_PIN_DF_IO7, + MFP_PIN_DF_IO8, + MFP_PIN_DF_IO9, + MFP_PIN_DF_IO10, + MFP_PIN_DF_IO11, + MFP_PIN_DF_IO12, + MFP_PIN_DF_IO13, + MFP_PIN_DF_IO14, + MFP_PIN_DF_IO15, + MFP_PIN_DF_nCS0_SM_nCS2, + MFP_PIN_DF_nCS1_SM_nCS3, + MFP_PIN_SM_nCS0, + MFP_PIN_SM_nCS1, + MFP_PIN_DF_WEn, + MFP_PIN_DF_REn, + MFP_PIN_DF_CLE_SM_OEn, + MFP_PIN_DF_ALE_SM_WEn, + MFP_PIN_DF_RDY0, + MFP_PIN_DF_RDY1, + + MFP_PIN_SM_SCLK, + MFP_PIN_SM_BE0, + MFP_PIN_SM_BE1, + MFP_PIN_SM_ADV, + MFP_PIN_SM_ADVMUX, + MFP_PIN_SM_RDY, + + MFP_PIN_MMC1_DAT7, + MFP_PIN_MMC1_DAT6, + MFP_PIN_MMC1_DAT5, + MFP_PIN_MMC1_DAT4, + MFP_PIN_MMC1_DAT3, + MFP_PIN_MMC1_DAT2, + MFP_PIN_MMC1_DAT1, + MFP_PIN_MMC1_DAT0, + MFP_PIN_MMC1_CMD, + MFP_PIN_MMC1_CLK, + MFP_PIN_MMC1_CD, + MFP_PIN_MMC1_WP, + + /* additional pins on PXA930 */ + MFP_PIN_GSIM_UIO, + MFP_PIN_GSIM_UCLK, + MFP_PIN_GSIM_UDET, + MFP_PIN_GSIM_nURST, + MFP_PIN_PMIC_INT, + MFP_PIN_RDY, + + /* additional pins on MMP2 */ + MFP_PIN_TWSI1_SCL, + MFP_PIN_TWSI1_SDA, + MFP_PIN_TWSI4_SCL, + MFP_PIN_TWSI4_SDA, + MFP_PIN_CLK_REQ, + + MFP_PIN_MAX, +}; + +/* + * a possible MFP configuration is represented by a 32-bit integer + * + * bit 0.. 9 - MFP Pin Number (1024 Pins Maximum) + * bit 10..12 - Alternate Function Selection + * bit 13..15 - Drive Strength + * bit 16..18 - Low Power Mode State + * bit 19..20 - Low Power Mode Edge Detection + * bit 21..22 - Run Mode Pull State + * + * to facilitate the definition, the following macros are provided + * + * MFP_CFG_DEFAULT - default MFP configuration value, with + * alternate function = 0, + * drive strength = fast 3mA (MFP_DS03X) + * low power mode = default + * edge detection = none + * + * MFP_CFG - default MFPR value with alternate function + * MFP_CFG_DRV - default MFPR value with alternate function and + * pin drive strength + * MFP_CFG_LPM - default MFPR value with alternate function and + * low power mode + * MFP_CFG_X - default MFPR value with alternate function, + * pin drive strength and low power mode + */ + +typedef unsigned long mfp_cfg_t; + +#define MFP_PIN(x) ((x) & 0x3ff) + +#define MFP_AF0 (0x0 << 10) +#define MFP_AF1 (0x1 << 10) +#define MFP_AF2 (0x2 << 10) +#define MFP_AF3 (0x3 << 10) +#define MFP_AF4 (0x4 << 10) +#define MFP_AF5 (0x5 << 10) +#define MFP_AF6 (0x6 << 10) +#define MFP_AF7 (0x7 << 10) +#define MFP_AF_MASK (0x7 << 10) +#define MFP_AF(x) (((x) >> 10) & 0x7) + +#define MFP_DS01X (0x0 << 13) +#define MFP_DS02X (0x1 << 13) +#define MFP_DS03X (0x2 << 13) +#define MFP_DS04X (0x3 << 13) +#define MFP_DS06X (0x4 << 13) +#define MFP_DS08X (0x5 << 13) +#define MFP_DS10X (0x6 << 13) +#define MFP_DS13X (0x7 << 13) +#define MFP_DS_MASK (0x7 << 13) +#define MFP_DS(x) (((x) >> 13) & 0x7) + +#define MFP_LPM_DEFAULT (0x0 << 16) +#define MFP_LPM_DRIVE_LOW (0x1 << 16) +#define MFP_LPM_DRIVE_HIGH (0x2 << 16) +#define MFP_LPM_PULL_LOW (0x3 << 16) +#define MFP_LPM_PULL_HIGH (0x4 << 16) +#define MFP_LPM_FLOAT (0x5 << 16) +#define MFP_LPM_INPUT (0x6 << 16) +#define MFP_LPM_STATE_MASK (0x7 << 16) +#define MFP_LPM_STATE(x) (((x) >> 16) & 0x7) + +#define MFP_LPM_EDGE_NONE (0x0 << 19) +#define MFP_LPM_EDGE_RISE (0x1 << 19) +#define MFP_LPM_EDGE_FALL (0x2 << 19) +#define MFP_LPM_EDGE_BOTH (0x3 << 19) +#define MFP_LPM_EDGE_MASK (0x3 << 19) +#define MFP_LPM_EDGE(x) (((x) >> 19) & 0x3) + +#define MFP_PULL_NONE (0x0 << 21) +#define MFP_PULL_LOW (0x1 << 21) +#define MFP_PULL_HIGH (0x2 << 21) +#define MFP_PULL_BOTH (0x3 << 21) +#define MFP_PULL_FLOAT (0x4 << 21) +#define MFP_PULL_MASK (0x7 << 21) +#define MFP_PULL(x) (((x) >> 21) & 0x7) + +#define MFP_CFG_DEFAULT (MFP_AF0 | MFP_DS03X | MFP_LPM_DEFAULT |\ + MFP_LPM_EDGE_NONE | MFP_PULL_NONE) + +#define MFP_CFG(pin, af) \ + ((MFP_CFG_DEFAULT & ~MFP_AF_MASK) |\ + (MFP_PIN(MFP_PIN_##pin) | MFP_##af)) + +#define MFP_CFG_DRV(pin, af, drv) \ + ((MFP_CFG_DEFAULT & ~(MFP_AF_MASK | MFP_DS_MASK)) |\ + (MFP_PIN(MFP_PIN_##pin) | MFP_##af | MFP_##drv)) + +#define MFP_CFG_LPM(pin, af, lpm) \ + ((MFP_CFG_DEFAULT & ~(MFP_AF_MASK | MFP_LPM_STATE_MASK)) |\ + (MFP_PIN(MFP_PIN_##pin) | MFP_##af | MFP_LPM_##lpm)) + +#define MFP_CFG_X(pin, af, drv, lpm) \ + ((MFP_CFG_DEFAULT & ~(MFP_AF_MASK | MFP_DS_MASK | MFP_LPM_STATE_MASK)) |\ + (MFP_PIN(MFP_PIN_##pin) | MFP_##af | MFP_##drv | MFP_LPM_##lpm)) + +#if defined(CONFIG_PXA3xx) || defined(CONFIG_ARCH_MMP) +/* + * each MFP pin will have a MFPR register, since the offset of the + * register varies between processors, the processor specific code + * should initialize the pin offsets by mfp_init() + * + * mfp_init_base() - accepts a virtual base for all MFPR registers and + * initialize the MFP table to a default state + * + * mfp_init_addr() - accepts a table of "mfp_addr_map" structure, which + * represents a range of MFP pins from "start" to "end", with the offset + * beginning at "offset", to define a single pin, let "end" = -1. + * + * use + * + * MFP_ADDR_X() to define a range of pins + * MFP_ADDR() to define a single pin + * MFP_ADDR_END to signal the end of pin offset definitions + */ +struct mfp_addr_map { + unsigned int start; + unsigned int end; + unsigned long offset; +}; + +#define MFP_ADDR_X(start, end, offset) \ + { MFP_PIN_##start, MFP_PIN_##end, offset } + +#define MFP_ADDR(pin, offset) \ + { MFP_PIN_##pin, -1, offset } + +#define MFP_ADDR_END { MFP_PIN_INVALID, 0 } + +void mfp_init_base(void __iomem *mfpr_base); +void mfp_init_addr(struct mfp_addr_map *map); + +/* + * mfp_{read, write}() - for direct read/write access to the MFPR register + * mfp_config() - for configuring a group of MFPR registers + * mfp_config_lpm() - configuring all low power MFPR registers for suspend + * mfp_config_run() - configuring all run time MFPR registers after resume + */ +unsigned long mfp_read(int mfp); +void mfp_write(int mfp, unsigned long mfpr_val); +void mfp_config(unsigned long *mfp_cfgs, int num); +void mfp_config_run(void); +void mfp_config_lpm(void); +#endif /* CONFIG_PXA3xx || CONFIG_ARCH_MMP */ + +#endif /* __ASM_PLAT_MFP_H */ -- cgit v1.2.3 From 3b5eed3c71a2fb60aa4405ad92a2a6ad2677f220 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 May 2022 13:54:58 -0700 Subject: netfs: Eliminate Clang randstruct warning Clang's structure layout randomization feature gets upset when it sees struct inode (which is randomized) cast to struct netfs_i_context. This is due to seeing the inode pointer as being treated as an array of inodes, rather than "something else, following struct inode". Since netfs can't use container_of() (since it doesn't know what the true containing struct is), it uses this direct offset instead. Adjust the code to better reflect what is happening: an arbitrary pointer is being adjusted and cast to something else: use a "void *" for the math. The resulting binary output is the same, but Clang no longer sees an unexpected cross-structure cast: In file included from ../fs/nfs/inode.c:50: In file included from ../fs/nfs/fscache.h:15: In file included from ../include/linux/fscache.h:18: ../include/linux/netfs.h:298:9: error: casting from randomized structure pointer type 'struct inode *' to 'struct netfs_i_context *' return (struct netfs_i_context *)(inode + 1); ^ 1 error generated. Cc: David Howells Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503205503.3054173-2-keescook@chromium.org Reviewed-by: Jeff Layton Link: https://lore.kernel.org/lkml/7562f8eccd7cc0e447becfe9912179088784e3b9.camel@kernel.org --- include/linux/netfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c7bf1eaf51d5..0c33b715cbfd 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -295,7 +295,7 @@ extern void netfs_stats_show(struct seq_file *); */ static inline struct netfs_i_context *netfs_i_context(struct inode *inode) { - return (struct netfs_i_context *)(inode + 1); + return (void *)inode + sizeof(*inode); } /** @@ -307,7 +307,7 @@ static inline struct netfs_i_context *netfs_i_context(struct inode *inode) */ static inline struct inode *netfs_inode(struct netfs_i_context *ctx) { - return ((struct inode *)ctx) - 1; + return (void *)ctx - sizeof(struct inode); } /** -- cgit v1.2.3 From 595b893e2087de306d0781795fb8ec47873596a6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 May 2022 13:55:00 -0700 Subject: randstruct: Reorganize Kconfigs and attribute macros In preparation for Clang supporting randstruct, reorganize the Kconfigs, move the attribute macros, and generalize the feature to be named CONFIG_RANDSTRUCT for on/off, CONFIG_RANDSTRUCT_FULL for the full randomization mode, and CONFIG_RANDSTRUCT_PERFORMANCE for the cache-line sized mode. Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503205503.3054173-4-keescook@chromium.org --- include/linux/compiler-gcc.h | 8 -------- include/linux/compiler_types.h | 14 +++++++------- include/linux/vermagic.h | 8 ++++---- 3 files changed, 11 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 52299c957c98..a0c55eeaeaf1 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,14 +66,6 @@ __builtin_unreachable(); \ } while (0) -#if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__) -#define __randomize_layout __attribute__((randomize_layout)) -#define __no_randomize_layout __attribute__((no_randomize_layout)) -/* This anon struct can add padding, so only enable it under randstruct. */ -#define randomized_struct_fields_start struct { -#define randomized_struct_fields_end } __randomize_layout; -#endif - /* * GCC 'asm goto' miscompiles certain code sequences: * diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1c2c33ae1b37..d08dfcb0ac68 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -242,15 +242,15 @@ struct ftrace_likely_data { # define __latent_entropy #endif -#ifndef __randomize_layout +#if defined(RANDSTRUCT) && !defined(__CHECKER__) +# define __randomize_layout __designated_init __attribute__((randomize_layout)) +# define __no_randomize_layout __attribute__((no_randomize_layout)) +/* This anon struct can add padding, so only enable it under randstruct. */ +# define randomized_struct_fields_start struct { +# define randomized_struct_fields_end } __randomize_layout; +#else # define __randomize_layout __designated_init -#endif - -#ifndef __no_randomize_layout # define __no_randomize_layout -#endif - -#ifndef randomized_struct_fields_start # define randomized_struct_fields_start # define randomized_struct_fields_end #endif diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 329d63babaeb..efb51a2da599 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -32,11 +32,11 @@ #else #define MODULE_VERMAGIC_MODVERSIONS "" #endif -#ifdef RANDSTRUCT_PLUGIN +#ifdef RANDSTRUCT #include -#define MODULE_RANDSTRUCT_PLUGIN "RANDSTRUCT_PLUGIN_" RANDSTRUCT_HASHED_SEED +#define MODULE_RANDSTRUCT "RANDSTRUCT_" RANDSTRUCT_HASHED_SEED #else -#define MODULE_RANDSTRUCT_PLUGIN +#define MODULE_RANDSTRUCT #endif #define VERMAGIC_STRING \ @@ -44,6 +44,6 @@ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ MODULE_ARCH_VERMAGIC \ - MODULE_RANDSTRUCT_PLUGIN + MODULE_RANDSTRUCT #endif /* _LINUX_VERMAGIC_H */ -- cgit v1.2.3 From be2b34fa9be31c60a95989f984c9a5d40cd781b6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 May 2022 13:55:02 -0700 Subject: randstruct: Move seed generation into scripts/basic/ To enable Clang randstruct support, move the structure layout randomization seed generation out of scripts/gcc-plugins/ into scripts/basic/ so it happens early enough that it can be used by either compiler implementation. The gcc-plugin still builds its own header file, but now does so from the common "randstruct.seed" file. Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220503205503.3054173-6-keescook@chromium.org --- include/linux/vermagic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index efb51a2da599..a54046bf37e5 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -33,7 +33,7 @@ #define MODULE_VERMAGIC_MODVERSIONS "" #endif #ifdef RANDSTRUCT -#include +#include #define MODULE_RANDSTRUCT "RANDSTRUCT_" RANDSTRUCT_HASHED_SEED #else #define MODULE_RANDSTRUCT -- cgit v1.2.3 From 9ec79840d6afaf472294588a6bbe145bcdffa28b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 27 Apr 2022 18:31:19 +0100 Subject: stackleak: rework stack low bound handling In stackleak_task_init(), stackleak_track_stack(), and __stackleak_erase(), we open-code skipping the STACK_END_MAGIC at the bottom of the stack. Each case is implemented slightly differently, and only the __stackleak_erase() case is commented. In stackleak_task_init() and stackleak_track_stack() we unconditionally add sizeof(unsigned long) to the lowest stack address. In stackleak_task_init() we use end_of_stack() for this, and in stackleak_track_stack() we use task_stack_page(). In __stackleak_erase() we handle this by detecting if `kstack_ptr` has hit the stack end boundary, and if so, conditionally moving it above the magic. This patch adds a new stackleak_task_low_bound() helper which is used in all three cases, which unconditionally adds sizeof(unsigned long) to the lowest address on the task stack, with commentary as to why. This uses end_of_stack() as stackleak_task_init() did prior to this patch, as this is consistent with the code in kernel/fork.c which initializes the STACK_END_MAGIC value. In __stackleak_erase() we no longer need to check whether we've spilled into the STACK_END_MAGIC value, as stackleak_track_stack() ensures that `current->lowest_stack` stops immediately above this, and similarly the poison scan will stop immediately above this. For stackleak_task_init() and stackleak_track_stack() this results in no change to code generation. For __stackleak_erase() the generated assembly is slightly simpler and shorter. Signed-off-by: Mark Rutland Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220427173128.2603085-5-mark.rutland@arm.com --- include/linux/stackleak.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index ccaab2043fcd..67430faa5c51 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -15,9 +15,22 @@ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK #include +/* + * The lowest address on tsk's stack which we can plausibly erase. + */ +static __always_inline unsigned long +stackleak_task_low_bound(const struct task_struct *tsk) +{ + /* + * The lowest unsigned long on the task stack contains STACK_END_MAGIC, + * which we must not corrupt. + */ + return (unsigned long)end_of_stack(tsk) + sizeof(unsigned long); +} + static inline void stackleak_task_init(struct task_struct *t) { - t->lowest_stack = (unsigned long)end_of_stack(t) + sizeof(unsigned long); + t->lowest_stack = stackleak_task_low_bound(t); # ifdef CONFIG_STACKLEAK_METRICS t->prev_lowest_stack = t->lowest_stack; # endif -- cgit v1.2.3 From 0cfa2ccd285d98ad62218add2eebdcfff69fb2c0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 27 Apr 2022 18:31:21 +0100 Subject: stackleak: rework stack high bound handling Prior to returning to userspace, we reset current->lowest_stack to a reasonable high bound. Currently we do this by subtracting the arbitrary value `THREAD_SIZE/64` from the top of the stack, for reasons lost to history. Looking at configurations today: * On i386 where THREAD_SIZE is 8K, the bound will be 128 bytes. The pt_regs at the top of the stack is 68 bytes (with 0 to 16 bytes of padding above), and so this covers an additional portion of 44 to 60 bytes. * On x86_64 where THREAD_SIZE is at least 16K (up to 32K with KASAN) the bound will be at least 256 bytes (up to 512 with KASAN). The pt_regs at the top of the stack is 168 bytes, and so this cover an additional 88 bytes of stack (up to 344 with KASAN). * On arm64 where THREAD_SIZE is at least 16K (up to 64K with 64K pages and VMAP_STACK), the bound will be at least 256 bytes (up to 1024 with KASAN). The pt_regs at the top of the stack is 336 bytes, so this can fall within the pt_regs, or can cover an additional 688 bytes of stack. Clearly the `THREAD_SIZE/64` value doesn't make much sense -- in the worst case, this will cause more than 600 bytes of stack to be erased for every syscall, even if actual stack usage were substantially smaller. This patches makes this slightly less nonsensical by consistently resetting current->lowest_stack to the base of the task pt_regs. For clarity and for consistency with the handling of the low bound, the generation of the high bound is split into a helper with commentary explaining why. Since the pt_regs at the top of the stack will be clobbered upon the next exception entry, we don't need to poison these at exception exit. By using task_pt_regs() as the high stack boundary instead of current_top_of_stack() we avoid some redundant poisoning, and the compiler can share the address generation between the poisoning and resetting of `current->lowest_stack`, making the generated code more optimal. It's not clear to me whether the existing `THREAD_SIZE/64` offset was a dodgy heuristic to skip the pt_regs, or whether it was attempting to minimize the number of times stackleak_check_stack() would have to update `current->lowest_stack` when stack usage was shallow at the cost of unconditionally poisoning a small portion of the stack for every exit to userspace. For now I've simply removed the offset, and if we need/want to minimize updates for shallow stack usage it should be easy to add a better heuristic atop, with appropriate commentary so we know what's going on. Signed-off-by: Mark Rutland Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220427173128.2603085-7-mark.rutland@arm.com --- include/linux/stackleak.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index 67430faa5c51..467661aeb413 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -28,6 +28,20 @@ stackleak_task_low_bound(const struct task_struct *tsk) return (unsigned long)end_of_stack(tsk) + sizeof(unsigned long); } +/* + * The address immediately after the highest address on tsk's stack which we + * can plausibly erase. + */ +static __always_inline unsigned long +stackleak_task_high_bound(const struct task_struct *tsk) +{ + /* + * The task's pt_regs lives at the top of the task stack and will be + * overwritten by exception entry, so there's no need to erase them. + */ + return (unsigned long)task_pt_regs(tsk); +} + static inline void stackleak_task_init(struct task_struct *t) { t->lowest_stack = stackleak_task_low_bound(t); -- cgit v1.2.3 From 77cf2b6dee6680536a3109d894f1b1ccda3fc5bf Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 27 Apr 2022 18:31:22 +0100 Subject: stackleak: rework poison scanning Currently we over-estimate the region of stack which must be erased. To determine the region to be erased, we scan downwards for a contiguous block of poison values (or the low bound of the stack). There are a few minor problems with this today: * When we find a block of poison values, we include this block within the region to erase. As this is included within the region to erase, this causes us to redundantly overwrite 'STACKLEAK_SEARCH_DEPTH' (128) bytes with poison. * As the loop condition checks 'poison_count <= depth', it will run an additional iteration after finding the contiguous block of poison, decrementing 'erase_low' once more than necessary. As this is included within the region to erase, this causes us to redundantly overwrite an additional unsigned long with poison. * As we always decrement 'erase_low' after checking an element on the stack, we always include the element below this within the region to erase. As this is included within the region to erase, this causes us to redundantly overwrite an additional unsigned long with poison. Note that this is not a functional problem. As the loop condition checks 'erase_low > task_stack_low', we'll never clobber the STACK_END_MAGIC. As we always decrement 'erase_low' after this, we'll never fail to erase the element immediately above the STACK_END_MAGIC. In total, this can cause us to erase `128 + 2 * sizeof(unsigned long)` bytes more than necessary, which is unfortunate. This patch reworks the logic to find the address immediately above the poisoned region, by finding the lowest non-poisoned address. This is factored into a stackleak_find_top_of_poison() helper both for clarity and so that this can be shared with the LKDTM test in subsequent patches. Signed-off-by: Mark Rutland Cc: Alexander Popov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220427173128.2603085-8-mark.rutland@arm.com --- include/linux/stackleak.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index 467661aeb413..c36e7a3b45e7 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -42,6 +42,32 @@ stackleak_task_high_bound(const struct task_struct *tsk) return (unsigned long)task_pt_regs(tsk); } +/* + * Find the address immediately above the poisoned region of the stack, where + * that region falls between 'low' (inclusive) and 'high' (exclusive). + */ +static __always_inline unsigned long +stackleak_find_top_of_poison(const unsigned long low, const unsigned long high) +{ + const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); + unsigned int poison_count = 0; + unsigned long poison_high = high; + unsigned long sp = high; + + while (sp > low && poison_count < depth) { + sp -= sizeof(unsigned long); + + if (*(unsigned long *)sp == STACKLEAK_POISON) { + poison_count++; + } else { + poison_count = 0; + poison_high = sp; + } + } + + return poison_high; +} + static inline void stackleak_task_init(struct task_struct *t) { t->lowest_stack = stackleak_task_low_bound(t); -- cgit v1.2.3 From 2e3afb42dd480d755cd7d97ca04586a5616c1a5e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 8 May 2022 11:37:09 +0200 Subject: blk-mq: remove the error_count from struct request The last two users were floppy.c and ataflop.c respectively, it was verified that no other drivers makes use of this, so let's remove it. Suggested-by: Linus Torvalds Cc: Minh Yuan Cc: Denis Efremov , Cc: Geert Uytterhoeven Cc: Christoph Hellwig Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7aa5c54901a9..9f07061418db 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -163,7 +163,6 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; - int error_count; /* for legacy drivers, don't use */ }; -- cgit v1.2.3 From 56f5746c414d92ae8e8314f46760822b4ecf8be3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 09:40:54 -0500 Subject: namei: Merge page_symlink() and __page_symlink() There are no callers of __page_symlink() left, so we can remove that entry point. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..e108aff23a28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3109,8 +3109,6 @@ extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -extern int __page_symlink(struct inode *inode, const char *symname, int len, - int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); -- cgit v1.2.3 From 236d93c4bf2d6da83241cc8e4625e89d9604cb43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 10:40:11 -0500 Subject: fs: Remove AOP_FLAG_NOFS With all users of this flag gone, we can stop testing whether it's set. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e108aff23a28..f81bc5cbcbb6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,10 +275,6 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct - * helper code (eg buffer layer) - * to clear GFP_FS from alloc */ - /* * oh the beauties of C type declarations. */ -- cgit v1.2.3 From de2a931150177957d37e9c975025604f4a1fe853 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 10:47:09 -0500 Subject: fs: Remove aop_flags parameter from netfs_write_begin() There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c7bf1eaf51d5..1c29f317d907 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -276,7 +276,7 @@ struct readahead_control; extern void netfs_readahead(struct readahead_control *); extern int netfs_readpage(struct file *, struct page *); extern int netfs_write_begin(struct file *, struct address_space *, - loff_t, unsigned int, unsigned int, struct folio **, + loff_t, unsigned int, struct folio **, void **); extern void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); -- cgit v1.2.3 From b3992d1e2ebcd478e0614494a6abd95e902a029b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 11:25:12 -0500 Subject: fs: Remove aop flags parameter from block_write_begin() There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index bcb4fe9b8575..63e49dfa7738 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -226,7 +226,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, int block_read_full_page(struct page*, get_block_t*); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - unsigned flags, struct page **pagep, get_block_t *get_block); + struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, -- cgit v1.2.3 From be3bbbc588118bdc10e21fdd7bfa6ee6b8c2555d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 11:25:12 -0500 Subject: fs: Remove aop flags parameter from cont_write_begin() There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 63e49dfa7738..127b60fad77e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -238,7 +238,7 @@ int generic_write_end(struct file *, struct address_space *, void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page **, void **, + unsigned, struct page **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); int block_commit_write(struct page *page, unsigned from, unsigned to); -- cgit v1.2.3 From b7446e7cf15f0926866c8e5de90ab278998bf8c8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 11:25:12 -0500 Subject: fs: Remove aop flags parameter from grab_cache_page_write_begin() There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 993994cd943a..65ae8f96554b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -735,7 +735,7 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping, } struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index, unsigned flags); + pgoff_t index); /* * Returns locked page at given index in given cache, creating it if needed. -- cgit v1.2.3 From 8371f30cf774a20fd627a0f7b1ecf00e8257f3bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 11:54:56 -0500 Subject: fs: Remove aop flags parameter from nobh_write_begin() There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 127b60fad77e..6e5a64005fef 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,7 +258,7 @@ static inline vm_fault_t block_page_mkwrite_return(int err) } sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -int nobh_write_begin(struct address_space *, loff_t, unsigned, unsigned, +int nobh_write_begin(struct address_space *, loff_t, unsigned len, struct page **, void **, get_block_t*); int nobh_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, -- cgit v1.2.3 From 9d6b0cd7579844761ed68926eb3073bab1dca87b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Feb 2022 14:31:43 -0500 Subject: fs: Remove flags parameter from aops->write_begin There are no more aop flags left, so remove the parameter. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f81bc5cbcbb6..a0e73432526f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -346,7 +346,7 @@ struct address_space_operations { void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -3179,7 +3179,7 @@ extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); -- cgit v1.2.3 From 84a1041c60ff8f648a09d28af7b2e50a8f6345ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Mar 2022 15:00:20 -0500 Subject: fs: Remove pagecache_write_begin() and pagecache_write_end() These wrappers have no more users; remove them. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0e73432526f..b35ce086a7a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,18 +380,6 @@ struct address_space_operations { extern const struct address_space_operations empty_aops; -/* - * pagecache_write_begin/pagecache_write_end must be used by general code - * to write into the pagecache. - */ -int pagecache_write_begin(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); - -int pagecache_write_end(struct file *, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); - /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. -- cgit v1.2.3 From 65aa6b5a18294b3713a90c120312ed5d63a16b82 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Apr 2022 11:38:20 -0400 Subject: filemap: Remove obsolete comment in lock_page We no longer need the page's inode pinned. This comment dates back to commit db37648cd6ce ("[PATCH] mm: non syncing lock_page()") which added lock_page_nosync(). That was removed by commit 7eaceaccab5f ("block: remove per-queue plugging") which also made this comment obsolete. Signed-off-by: Miaohe Lin Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/pagemap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 65ae8f96554b..ab47579af434 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -908,9 +908,6 @@ static inline void folio_lock(struct folio *folio) __folio_lock(folio); } -/* - * lock_page may only be called if we have the page's inode pinned. - */ static inline void lock_page(struct page *page) { struct folio *folio; -- cgit v1.2.3 From cd125eeab2de8ef8ca3a0f3a284bc695375c73af Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 4 Apr 2022 13:24:36 -0400 Subject: filemap: Update the folio_lock documentation Add kernel-doc for several functions relating to take the folio lock. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagemap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ab47579af434..60657132080f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -888,6 +888,18 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, void unlock_page(struct page *page); void folio_unlock(struct folio *folio); +/** + * folio_trylock() - Attempt to lock a folio. + * @folio: The folio to attempt to lock. + * + * Sometimes it is undesirable to wait for a folio to be unlocked (eg + * when the locks are being taken in the wrong order, or if making + * progress through a batch of folios is more important than processing + * them in order). Usually folio_lock() is the correct function to call. + * + * Context: Any context. + * Return: Whether the lock was successfully acquired. + */ static inline bool folio_trylock(struct folio *folio) { return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); @@ -901,6 +913,28 @@ static inline int trylock_page(struct page *page) return folio_trylock(page_folio(page)); } +/** + * folio_lock() - Lock this folio. + * @folio: The folio to lock. + * + * The folio lock protects against many things, probably more than it + * should. It is primarily held while a folio is being brought uptodate, + * either from its backing file or from swap. It is also held while a + * folio is being truncated from its address_space, so holding the lock + * is sufficient to keep folio->mapping stable. + * + * The folio lock is also held while write() is modifying the page to + * provide POSIX atomicity guarantees (as long as the write does not + * cross a page boundary). Other modifications to the data in the folio + * do not hold the folio lock and can race with writes, eg DMA and stores + * to mapped pages. + * + * Context: May sleep. If you need to acquire the locks of two or + * more folios, they must be in order of ascending index, if they are + * in the same address_space. If they are in different address_spaces, + * acquire the lock of the folio which belongs to the address_space which + * has the lowest address in memory first. + */ static inline void folio_lock(struct folio *folio) { might_sleep(); @@ -908,6 +942,17 @@ static inline void folio_lock(struct folio *folio) __folio_lock(folio); } +/** + * lock_page() - Lock the folio containing this page. + * @page: The page to lock. + * + * See folio_lock() for a description of what the lock protects. + * This is a legacy function and new code should probably use folio_lock() + * instead. + * + * Context: May sleep. Pages in the same folio share a lock, so do not + * attempt to lock two pages which share a folio. + */ static inline void lock_page(struct page *page) { struct folio *folio; @@ -918,6 +963,16 @@ static inline void lock_page(struct page *page) __folio_lock(folio); } +/** + * folio_lock_killable() - Lock this folio, interruptible by a fatal signal. + * @folio: The folio to lock. + * + * Attempts to lock the folio, like folio_lock(), except that the sleep + * to acquire the lock is interruptible by a fatal signal. + * + * Context: May sleep; see folio_lock(). + * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received. + */ static inline int folio_lock_killable(struct folio *folio) { might_sleep(); @@ -964,8 +1019,8 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr); * Wait for a folio to be unlocked. * * This must be called with the caller "holding" the folio, - * ie with increased "page->count" so that the folio won't - * go away during the wait.. + * ie with increased folio reference count so that the folio won't + * go away during the wait. */ static inline void folio_wait_locked(struct folio *folio) { -- cgit v1.2.3 From 520f301c54faa3484e820b80d4505d48ee587163 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Jan 2022 14:35:22 -0500 Subject: fs: Convert is_dirty_writeback() to take a folio Pass a folio instead of a page to aops->is_dirty_writeback(). Convert both implementations and the caller. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/buffer_head.h | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6e5a64005fef..805c4e12700a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -146,7 +146,7 @@ BUFFER_FNS(Defer_Completion, defer_completion) #define page_has_buffers(page) PagePrivate(page) #define folio_buffers(folio) folio_get_private(folio) -void buffer_check_dirty_writeback(struct page *page, +void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index b35ce086a7a1..2be852661a29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -369,7 +369,7 @@ struct address_space_operations { int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); - void (*is_dirty_writeback) (struct page *, bool *, bool *); + void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ -- cgit v1.2.3 From 2ebdd1df316636c2faf25a1780e12553adf09cf7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 17 Mar 2021 22:38:26 -0400 Subject: mm/readahead: Convert page_cache_async_readahead to take a folio Removes a couple of calls to compound_head and saves a few bytes. Also convert verity's read_file_data_page() to be folio-based. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/pagemap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 60657132080f..b70192f56454 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1242,7 +1242,7 @@ void page_cache_sync_readahead(struct address_space *mapping, * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. - * @page: The page at @index which triggered the readahead call. + * @folio: The folio at @index which triggered the readahead call. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * @@ -1254,10 +1254,10 @@ void page_cache_sync_readahead(struct address_space *mapping, static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, - struct page *page, pgoff_t index, unsigned long req_count) + struct folio *folio, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); - page_cache_async_ra(&ractl, page_folio(page), req_count); + page_cache_async_ra(&ractl, folio, req_count); } static inline struct folio *__readahead_folio(struct readahead_control *ractl) -- cgit v1.2.3 From 6d97af487dee3176cb1342d4ab16637e495440ad Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 4 May 2022 08:23:50 +0200 Subject: entry: Rename arch_check_user_regs() to arch_enter_from_user_mode() arch_check_user_regs() is used at the moment to verify that struct pt_regs contains valid values when entering the kernel from userspace. s390 needs a place in the generic entry code to modify a cpu data structure when switching from userspace to kernel mode. As arch_check_user_regs() is exactly this, rename it to arch_enter_from_user_mode(). When entering the kernel from userspace, arch_check_user_regs() is used to verify that struct pt_regs contains valid values. Note that the NMI codepath doesn't call this function. s390 needs a place in the generic entry code to modify a cpu data structure when switching from userspace to kernel mode. As arch_check_user_regs() is exactly this, rename it to arch_enter_from_user_mode(). Signed-off-by: Sven Schnelle Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Link: https://lore.kernel.org/r/20220504062351.2954280-2-tmricht@linux.ibm.com Signed-off-by: Heiko Carstens --- include/linux/entry-common.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ab78bd4c2eb0..c92ac75d6556 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -63,7 +63,7 @@ ARCH_EXIT_TO_USER_MODE_WORK) /** - * arch_check_user_regs - Architecture specific sanity check for user mode regs + * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs * * Defaults to an empty implementation. Can be replaced by architecture @@ -73,10 +73,10 @@ * section. Use __always_inline so the compiler cannot push it out of line * and make it instrumentable. */ -static __always_inline void arch_check_user_regs(struct pt_regs *regs); +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); -#ifndef arch_check_user_regs -static __always_inline void arch_check_user_regs(struct pt_regs *regs) {} +#ifndef arch_enter_from_user_mode +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} #endif /** -- cgit v1.2.3 From 2fbdf45d7d26361a0c3ec8833fd96edf0f5812da Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 6 May 2022 11:12:57 -0700 Subject: list: Add list_next_entry_circular() and list_prev_entry_circular() Add macros to get the next or previous entries and wraparound if needed. For example, calling list_next_entry_circular() on the last element should return the first element in the list. Signed-off-by: Ricardo Martinez Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/list.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index dd6c2041d09c..c147eeb2d39d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -563,6 +563,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) +/** + * list_next_entry_circular - get the next element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the last element (return the first element). + * Note, that list is expected to be not empty. + */ +#define list_next_entry_circular(pos, head, member) \ + (list_is_last(&(pos)->member, head) ? \ + list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member)) + /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor @@ -571,6 +584,19 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) +/** + * list_prev_entry_circular - get the prev element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the first element (return the last element). + * Note, that list is expected to be not empty. + */ +#define list_prev_entry_circular(pos, head, member) \ + (list_is_first(&(pos)->member, head) ? \ + list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member)) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From a4ff365346c98081311c299955d91d657123e8aa Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 6 May 2022 11:12:58 -0700 Subject: net: skb: introduce skb_data_area_size() Helper to calculate the linear data space in the skb. Signed-off-by: Ricardo Martinez Reviewed-by: Sergey Ryazanov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5c2599e3fe7d..d58669d6cb91 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1665,6 +1665,11 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif +static inline unsigned int skb_data_area_size(struct sk_buff *skb) +{ + return skb_end_pointer(skb) - skb->data; +} + struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From ca4567f1e6f660f86fcd04f3563c0045b0d4772f Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Thu, 5 May 2022 17:09:57 +0200 Subject: rtnetlink: add extack support in fdb del handlers Add extack support to .ndo_fdb_del in netdevice.h and all related methods. Signed-off-by: Alaa Mohamed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cf0ac616cb9..74c97a34921d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1513,7 +1513,7 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid); + u16 vid, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, -- cgit v1.2.3 From 90532850eb210dff70423eaf20e323a91ef542d8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:52 +0200 Subject: net: phy: introduce genphy_c45_pma_baset1_setup_master_slave() Move baset1 specific part of genphy_c45_pma_setup_forced() code to separate function to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d12054932ba..d3f924d3b235 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1614,6 +1614,7 @@ int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); +int genphy_c45_pma_baset1_setup_master_slave(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); -- cgit v1.2.3 From b9a366f3d874ce1c9c0148ec399f2ce4ab05a467 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:54 +0200 Subject: net: phy: introduce genphy_c45_pma_baset1_read_master_slave() Move baset1 specific part of genphy_c45_read_pma() code to separate function to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d3f924d3b235..4713c95d65fb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1619,6 +1619,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); -- cgit v1.2.3 From 2013ad8836aceb828c0fb5b16fa8bb98fd3d9b28 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 6 May 2022 06:23:56 +0200 Subject: net: phy: export genphy_c45_baset1_read_status() Export genphy_c45_baset1_read_status() to make it reusable by PHY drivers. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4713c95d65fb..508f1149665b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1621,6 +1621,7 @@ int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); +int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); -- cgit v1.2.3 From 0257be79fc4a16a3252ce80aa13b3640f728c425 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 29 Apr 2022 12:20:18 +0200 Subject: mtd: spi-nor: expose internal parameters via debugfs There is no way to gather all information to verify support for a new flash chip. Also if you want to convert an existing flash chip to the new SFDP parsing, there is not enough information to determine if the flash will work like before. To ease this development, expose internal parameters via the debugfs. Signed-off-by: Michael Walle Signed-off-by: Pratyush Yadav Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20220429102018.2361038-2-michael@walle.cc --- include/linux/mtd/spi-nor.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index e505c4a5c530..1ede4c89805a 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -363,6 +363,7 @@ struct spi_nor_flash_parameter; * @write_proto: the SPI protocol for write operations * @reg_proto: the SPI protocol for read_reg/write_reg/erase operations * @sfdp: the SFDP data of the flash + * @debugfs_root: pointer to the debugfs directory * @controller_ops: SPI NOR controller driver specific operations. * @params: [FLASH-SPECIFIC] SPI NOR flash parameters and settings. * The structure includes legacy flash parameters and @@ -392,6 +393,7 @@ struct spi_nor { u32 flags; enum spi_nor_cmd_ext cmd_ext_type; struct sfdp *sfdp; + struct dentry *debugfs_root; const struct spi_nor_controller_ops *controller_ops; -- cgit v1.2.3 From b1c5f3085149e9643b125eb10aae0e74644d7dcc Mon Sep 17 00:00:00 2001 From: Ricky WU Date: Mon, 21 Mar 2022 11:18:30 +0000 Subject: misc: rtsx: add rts5261 efuse function move rts5261_fetch_vendor_settings() to rts5261_init_from_hw() make sure it be called from S3 or D3 add more register setting when efuse is set read efuse setting to register on init flow Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/18101ecb0f0749ccb9f564eda171ba40@realtek.com Signed-off-by: Greg Kroah-Hartman --- include/linux/rtsx_pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 3d780b44e678..534038d962e4 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1067,6 +1067,9 @@ #define PCR_SETTING_REG1 0x724 #define PCR_SETTING_REG2 0x814 #define PCR_SETTING_REG3 0x747 +#define PCR_SETTING_REG4 0x818 +#define PCR_SETTING_REG5 0x81C + #define rtsx_pci_init_cmd(pcr) ((pcr)->ci = 0) -- cgit v1.2.3 From dbc2f62061c6bfba0aee93161ee3194dcee84bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 29 Apr 2022 17:26:46 +0100 Subject: nvmem: core: support passing DT node in cell info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some hardware may have NVMEM cells described in Device Tree using individual nodes. Let drivers pass such nodes to the NVMEM subsystem so they can be later used by NVMEM consumers. Signed-off-by: Rafał Miłecki Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220429162701.2222-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index c0c0cefc3b92..980f9c9ac0bc 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -25,6 +25,7 @@ struct nvmem_cell_info { unsigned int bytes; unsigned int bit_offset; unsigned int nbits; + struct device_node *np; }; /** -- cgit v1.2.3 From 5efe7448a1426250b5747c10ad438517f44f1e51 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 08:43:23 -0400 Subject: fs: Introduce aops->read_folio Change all the callers of ->readpage to call ->read_folio in preference, if it exists. This is a transitional duplication, and will be removed by the end of the series. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2be852661a29..5ad942183a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -336,6 +336,7 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); + int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); -- cgit v1.2.3 From 6c62371b7fd77628feb5b806bc29433caecedff8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 08:49:28 -0400 Subject: fs: Convert netfs_readpage to netfs_read_folio This is straightforward because netfs already worked in terms of folios. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 1c29f317d907..4bd5ee709daa 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -274,7 +274,7 @@ struct netfs_cache_ops { struct readahead_control; extern void netfs_readahead(struct readahead_control *); -extern int netfs_readpage(struct file *, struct page *); +int netfs_read_folio(struct file *, struct folio *); extern int netfs_write_begin(struct file *, struct address_space *, loff_t, unsigned int, struct folio **, void **); -- cgit v1.2.3 From 7479c505b4ab5ed5f81f35fdd68c44c58d6f0439 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 08:54:32 -0400 Subject: fs: Convert iomap_readpage to iomap_read_folio A straightforward conversion as iomap_readpage already worked in folios. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b76f0dd149fb..5b2aa45ddda3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -225,7 +225,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); -int iomap_readpage(struct page *page, const struct iomap_ops *ops); +int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); int iomap_releasepage(struct page *page, gfp_t gfp_mask); -- cgit v1.2.3 From 2c69e2057962b6bd76d72446453862eb59325b49 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 10:40:40 -0400 Subject: fs: Convert block_read_full_page() to block_read_full_folio() This function is NOT converted to handle large folios, so include an assert that the filesystem isn't passing one in. Otherwise, use the folio functions instead of the page functions, where they exist. Convert all filesystems which use block_read_full_page(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 805c4e12700a..31d82fd9abe8 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -223,7 +223,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); -int block_read_full_page(struct page*, get_block_t*); +int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, get_block_t *get_block); -- cgit v1.2.3 From f132ab7d3ab03c5bae28d31fb80ba77c4da05500 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:47:39 -0400 Subject: fs: Convert mpage_readpage to mpage_read_folio mpage_readpage still works in terms of pages, and has not been audited for correctness with large folios, so include an assertion that the filesystem is not passing it large folios. Convert all the filesystems to call mpage_read_folio() instead of mpage_readpage(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mpage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mpage.h b/include/linux/mpage.h index f4f5e90a6844..43986f7ec4dd 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -16,7 +16,7 @@ struct writeback_control; struct readahead_control; void mpage_readahead(struct readahead_control *, get_block_t get_block); -int mpage_readpage(struct page *page, get_block_t get_block); +int mpage_read_folio(struct folio *folio, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3 From 65d023af7f29eb1250a6105141a74776bae7e1f8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:12:16 -0400 Subject: nfs: Convert nfs to read_folio This is a "weak" conversion which converts straight back to using pages. A full conversion should be performed at some point, hopefully by someone familiar with the filesystem. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b48b9259e02c..1bba71757d62 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -594,7 +594,7 @@ static inline bool nfs_have_writebacks(const struct inode *inode) /* * linux/fs/nfs/read.c */ -extern int nfs_readpage(struct file *, struct page *); +int nfs_read_folio(struct file *, struct folio *); void nfs_readahead(struct readahead_control *); /* -- cgit v1.2.3 From 7e0a126519b82648b254afcd95a168c15f65ea40 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:53:28 -0400 Subject: mm,fs: Remove aops->readpage With all implementations of aops->readpage converted to aops->read_folio, we can stop checking whether it's set and remove the member from aops. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ad942183a2c..f812f5aa07dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -262,7 +262,7 @@ struct iattr { * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned - * by readpage(). + * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a @@ -335,7 +335,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); - int (*readpage)(struct file *, struct page *); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ -- cgit v1.2.3 From e9b5b23e957ef9260fec811d8d8081125889308a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 21:39:29 -0400 Subject: fs: Change the type of filler_t By making filler_t the same as read_folio, we can use the same function for both in gfs2. We can push the use of folios down one more level in jffs2 and nfs. We also increase type safety for future users of the various read_cache_page() family of functions by forcing the parameter to be a pointer to struct file (or NULL). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Andreas Gruenbacher --- include/linux/pagemap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b70192f56454..831b28dab01a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -492,7 +492,7 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x) return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } -typedef int filler_t(void *, struct page *); +typedef int filler_t(struct file *, struct folio *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); @@ -747,9 +747,9 @@ static inline struct page *grab_cache_page(struct address_space *mapping, } struct folio *read_cache_folio(struct address_space *, pgoff_t index, - filler_t *filler, void *data); + filler_t *filler, struct file *file); struct page *read_cache_page(struct address_space *, pgoff_t index, - filler_t *filler, void *data); + filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -- cgit v1.2.3 From 218d921b581eadf312c8ef0e09113b111f104eeb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Apr 2022 22:08:54 -0700 Subject: fscrypt: add new helper functions for test_dummy_encryption Unfortunately the design of fscrypt_set_test_dummy_encryption() doesn't work properly for the new mount API, as it combines too many steps into one function: - Parse the argument to test_dummy_encryption - Check the setting against the filesystem instance - Apply the setting to the filesystem instance The new mount API has split these into separate steps. ext4 partially worked around this by duplicating some of the logic, but it still had some bugs. To address this, add some new helper functions that split up the steps of fscrypt_set_test_dummy_encryption(): - fscrypt_parse_test_dummy_encryption() - fscrypt_dummy_policies_equal() - fscrypt_add_test_dummy_key() While we're add it, also add a function fscrypt_is_dummy_policy_set() which will be useful to avoid some #ifdef's. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220501050857.538984-5-ebiggers@kernel.org --- include/linux/fscrypt.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index efc7f96e5e26..e60d57c99cb6 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -32,6 +32,7 @@ union fscrypt_policy; struct fscrypt_info; +struct fs_parameter; struct seq_file; struct fscrypt_str { @@ -289,10 +290,19 @@ struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; +int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy); +bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2); int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, struct fscrypt_dummy_policy *dummy_policy); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); +static inline bool +fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) +{ + return dummy_policy->policy != NULL; +} static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { @@ -303,6 +313,8 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) /* keyring.c */ void fscrypt_sb_free(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); +int fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); @@ -477,12 +489,32 @@ static inline int fscrypt_set_context(struct inode *inode, void *fs_data) struct fscrypt_dummy_policy { }; +static inline int +fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy) +{ + return -EINVAL; +} + +static inline bool +fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2) +{ + return true; +} + static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } +static inline bool +fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) +{ + return false; +} + static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { @@ -498,6 +530,13 @@ static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) return -EOPNOTSUPP; } +static inline int +fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy) +{ + return 0; +} + static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; -- cgit v1.2.3 From 623a1ddfeb232526275ddd0c8378771e6712aad4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:42 -0700 Subject: mm/hugetlb: take src_mm->write_protect_seq in copy_hugetlb_page_range() Let's do it just like copy_page_range(), taking the seqlock and making sure the mmap_lock is held in write mode. This allows for add a VM_BUG_ON to page_needs_cow_for_dma() and properly synchronizes concurrent fork() with GUP-fast of hugetlb pages, which will be relevant for further changes. Link: https://lkml.kernel.org/r/20220428083441.37290-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b9316b2c11ce..a02812178562 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1576,6 +1576,8 @@ static inline bool page_maybe_dma_pinned(struct page *page) /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for a page on the src mm. + * + * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page) @@ -1583,6 +1585,8 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, if (!is_cow_mapping(vma->vm_flags)) return false; + VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; -- cgit v1.2.3 From fb3d824d1a46c5bb0584ea88f32dc2495544aebf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap() ... and move the special check for pinned pages into page_try_dup_anon_rmap() to prepare for tracking exclusive anonymous pages via a new pageflag, clearing it only after making sure that there are no GUP pins on the anonymous page. We really only care about pins on anonymous pages, because they are prone to getting replaced in the COW handler once mapped R/O. For !anon pages in cow-mappings (!VM_SHARED && VM_MAYWRITE) we shouldn't really care about that, at least not that I could come up with an example. Let's drop the is_cow_mapping() check from page_needs_cow_for_dma(), as we know we're dealing with anonymous pages. Also, drop the handling of pinned pages from copy_huge_pud() and add a comment if ever supporting anonymous pages on the PUD level. This is a preparation for tracking exclusivity of anonymous pages in the rmap code, and disallowing marking a page shared (-> failing to duplicate) if there are GUP pins on a page. Link: https://lkml.kernel.org/r/20220428083441.37290-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +---- include/linux/rmap.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a02812178562..9ee3ae51e8e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1575,16 +1575,13 @@ static inline bool page_maybe_dma_pinned(struct page *page) /* * This should most likely only be called during fork() to see whether we - * should break the cow immediately for a page on the src mm. + * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page) { - if (!is_cow_mapping(vma->vm_flags)) - return false; - VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8573aae50d96..73f41544084f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -182,11 +183,57 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -static inline void page_dup_rmap(struct page *page, bool compound) +static inline void __page_dup_rmap(struct page *page, bool compound) { atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } +static inline void page_dup_file_rmap(struct page *page, bool compound) +{ + __page_dup_rmap(page, compound); +} + +/** + * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped + * anonymous page + * @page: the page to duplicate the mapping for + * @compound: the page is mapped as compound or as a small page + * @vma: the source vma + * + * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq. + * + * Duplicating the mapping can only fail if the page may be pinned; device + * private pages cannot get pinned and consequently this function cannot fail. + * + * If duplicating the mapping succeeds, the page has to be mapped R/O into + * the parent and the child. It must *not* get mapped writable after this call. + * + * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. + */ +static inline int page_try_dup_anon_rmap(struct page *page, bool compound, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(!PageAnon(page), page); + + /* + * If this page may have been pinned by the parent process, + * don't allow to duplicate the mapping but instead require to e.g., + * copy the page immediately for the child so that we'll always + * guarantee the pinned page won't be randomly replaced in the + * future on write faults. + */ + if (likely(!is_device_private_page(page) && + unlikely(page_needs_cow_for_dma(vma, page)))) + return -EBUSY; + + /* + * It's okay to share the anon page between both processes, mapping + * the page R/O into both processes. + */ + __page_dup_rmap(page, compound); + return 0; +} + /* * Called from mm/vmscan.c to handle paging out */ -- cgit v1.2.3 From 14f9135d547060d1d0c182501201f8da19895fe3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: convert RMAP flags to a proper distinct rmap_t type We want to pass the flags to more than one anon rmap function, getting rid of special "do_page_add_anon_rmap()". So let's pass around a distinct __bitwise type and refine documentation. Link: https://lkml.kernel.org/r/20220428083441.37290-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 73f41544084f..c53a38151089 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -160,9 +160,23 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); -/* bitflags for do_page_add_anon_rmap() */ -#define RMAP_EXCLUSIVE 0x01 -#define RMAP_COMPOUND 0x02 +/* RMAP flags, currently only relevant for some anon rmap operations. */ +typedef int __bitwise rmap_t; + +/* + * No special request: if the page is a subpage of a compound page, it is + * mapped via a PTE. The mapped (sub)page is possibly shared between processes. + */ +#define RMAP_NONE ((__force rmap_t)0) + +/* The (sub)page is exclusive to a single process. */ +#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) + +/* + * The compound page is not mapped via PTEs, but instead via a single PMD and + * should be accounted accordingly. + */ +#define RMAP_COMPOUND ((__force rmap_t)BIT(1)) /* * rmap interfaces called when adding or removing pte of page @@ -171,7 +185,7 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, int flags); + unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); void page_add_file_rmap(struct page *, struct vm_area_struct *, -- cgit v1.2.3 From f1e2db12e45baaa2d366f87c885968096c2ff5aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: remove do_page_add_anon_rmap() ... and instead convert page_add_anon_rmap() to accept flags. Passing flags instead of bools is usually nicer either way, and we want to more often also pass RMAP_EXCLUSIVE in follow up patches when detecting that an anonymous page is exclusive: for example, when restoring an anonymous page from a writable migration entry. This is a preparation for marking an anonymous page inside page_add_anon_rmap() as exclusive when RMAP_EXCLUSIVE is passed. Link: https://lkml.kernel.org/r/20220428083441.37290-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c53a38151089..643801a937f3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,8 +183,6 @@ typedef int __bitwise rmap_t; */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, bool compound); -void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, bool compound); -- cgit v1.2.3 From 28c5209dfd5f86f4398ce01bfac8508b2c4d4050 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: pass rmap flags to hugepage_add_anon_rmap() Let's prepare for passing RMAP_EXCLUSIVE, similarly as we do for page_add_anon_rmap() now. RMAP_COMPOUND is implicit for hugetlb pages and ignored. Link: https://lkml.kernel.org/r/20220428083441.37290-8-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 643801a937f3..90e1e6925789 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,7 +191,7 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *, void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); + unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -- cgit v1.2.3 From 40f2bbf71161fa9195c7869004290003af152375 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:43 -0700 Subject: mm/rmap: drop "compound" parameter from page_add_new_anon_rmap() New anonymous pages are always mapped natively: only THP/khugepaged code maps a new compound anonymous page and passes "true". Otherwise, we're just dealing with simple, non-compound pages. Let's give the interface clearer semantics and document these. Remove the PageTransCompound() sanity check from page_add_new_anon_rmap(). Link: https://lkml.kernel.org/r/20220428083441.37290-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 90e1e6925789..e4156921eea9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -185,11 +185,12 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, bool compound); + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); + void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, -- cgit v1.2.3 From 78fbe906cc900b33ce078102e13e0e99b5b8c406 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/page-flags: reuse PG_mappedtodisk as PG_anon_exclusive for PageAnon() pages The basic question we would like to have a reliable and efficient answer to is: is this anonymous page exclusive to a single process or might it be shared? We need that information for ordinary/single pages, hugetlb pages, and possibly each subpage of a THP. Introduce a way to mark an anonymous page as exclusive, with the ultimate goal of teaching our COW logic to not do "wrong COWs", whereby GUP pins lose consistency with the pages mapped into the page table, resulting in reported memory corruptions. Most pageflags already have semantics for anonymous pages, however, PG_mappedtodisk should never apply to pages in the swapcache, so let's reuse that flag. As PG_has_hwpoisoned also uses that flag on the second tail page of a compound page, convert it to PG_error instead, which is marked as PF_NO_TAIL, so never used for tail pages. Use custom page flag modification functions such that we can do additional sanity checks. The semantics we'll put into some kernel doc in the future are: " PG_anon_exclusive is *usually* only expressive in combination with a page table entry. Depending on the page table entry type it might store the following information: Is what's mapped via this page table entry exclusive to the single process and can be mapped writable without further checks? If not, it might be shared and we might have to COW. For now, we only expect PTE-mapped THPs to make use of PG_anon_exclusive in subpages. For other anonymous compound folios (i.e., hugetlb), only the head page is logically mapped and holds this information. For example, an exclusive, PMD-mapped THP only has PG_anon_exclusive set on the head page. When replacing the PMD by a page table full of PTEs, PG_anon_exclusive, if set on the head page, will be set on all tail pages accordingly. Note that converting from a PTE-mapping to a PMD mapping using the same compound page is currently not possible and consequently doesn't require care. If GUP wants to take a reliable pin (FOLL_PIN) on an anonymous page, it should only pin if the relevant PG_anon_exclusive is set. In that case, the pin will be fully reliable and stay consistent with the pages mapped into the page table, as the bit cannot get cleared (e.g., by fork(), KSM) while the page is pinned. For anonymous pages that are mapped R/W, PG_anon_exclusive can be assumed to always be set because such pages cannot possibly be shared. The page table lock protecting the page table entry is the primary synchronization mechanism for PG_anon_exclusive; GUP-fast that does not take the PT lock needs special care when trying to clear the flag. Page table entry types and PG_anon_exclusive: * Present: PG_anon_exclusive applies. * Swap: the information is lost. PG_anon_exclusive was cleared. * Migration: the entry holds this information instead. PG_anon_exclusive was cleared. * Device private: PG_anon_exclusive applies. * Device exclusive: PG_anon_exclusive applies. * HW Poison: PG_anon_exclusive is stale and not changed. If the page may be pinned (FOLL_PIN), clearing PG_anon_exclusive is not allowed and the flag will stick around until the page is freed and folio->mapping is cleared. " We won't be clearing PG_anon_exclusive on destructive unmapping (i.e., zapping) of page table entries, page freeing code will handle that when also invalidate page->mapping to not indicate PageAnon() anymore. Letting information about exclusivity stick around will be an important property when adding sanity checks to unpinning code. Note that we properly clear the flag in free_pages_prepare() via PAGE_FLAGS_CHECK_AT_PREP for each individual subpage of a compound page, so there is no need to manually clear the flag. Link: https://lkml.kernel.org/r/20220428083441.37290-12-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1ea896887ee4..b70124b9c7c1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -142,6 +142,15 @@ enum pageflags { PG_readahead = PG_reclaim, + /* + * Depending on the way an anonymous folio can be mapped into a page + * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped + * THP), PG_anon_exclusive may be set only for the head page or for + * tail pages of an anonymous folio. For now, we only expect it to be + * set on tail pages for PTE-mapped THP. + */ + PG_anon_exclusive = PG_mappedtodisk, + /* Filesystems */ PG_checked = PG_owner_priv_1, @@ -176,7 +185,7 @@ enum pageflags { * Indicates that at least one subpage is hwpoisoned in the * THP. */ - PG_has_hwpoisoned = PG_mappedtodisk, + PG_has_hwpoisoned = PG_error, #endif /* non-lru isolated movable page */ @@ -1002,6 +1011,34 @@ extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); +static __always_inline int PageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void SetPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void ClearPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + +static __always_inline void __ClearPageAnonExclusive(struct page *page) +{ + VM_BUG_ON_PGFLAGS(!PageAnon(page), page); + VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); +} + #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else -- cgit v1.2.3 From 6c287605fd56466e645693eff3ae7c08fba56e0a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm: remember exclusively mapped anonymous pages with PG_anon_exclusive Let's mark exclusively mapped anonymous pages with PG_anon_exclusive as exclusive, and use that information to make GUP pins reliable and stay consistent with the page mapped into the page table even if the page table entry gets write-protected. With that information at hand, we can extend our COW logic to always reuse anonymous pages that are exclusive. For anonymous pages that might be shared, the existing logic applies. As already documented, PG_anon_exclusive is usually only expressive in combination with a page table entry. Especially PTE vs. PMD-mapped anonymous pages require more thought, some examples: due to mremap() we can easily have a single compound page PTE-mapped into multiple page tables exclusively in a single process -- multiple page table locks apply. Further, due to MADV_WIPEONFORK we might not necessarily write-protect all PTEs, and only some subpages might be pinned. Long story short: once PTE-mapped, we have to track information about exclusivity per sub-page, but until then, we can just track it for the compound page in the head page and not having to update a whole bunch of subpages all of the time for a simple PMD mapping of a THP. For simplicity, this commit mostly talks about "anonymous pages", while it's for THP actually "the part of an anonymous folio referenced via a page table entry". To not spill PG_anon_exclusive code all over the mm code-base, we let the anon rmap code to handle all PG_anon_exclusive logic it can easily handle. If a writable, present page table entry points at an anonymous (sub)page, that (sub)page must be PG_anon_exclusive. If GUP wants to take a reliably pin (FOLL_PIN) on an anonymous page references via a present page table entry, it must only pin if PG_anon_exclusive is set for the mapped (sub)page. This commit doesn't adjust GUP, so this is only implicitly handled for FOLL_WRITE, follow-up commits will teach GUP to also respect it for FOLL_PIN without FOLL_WRITE, to make all GUP pins of anonymous pages fully reliable. Whenever an anonymous page is to be shared (fork(), KSM), or when temporarily unmapping an anonymous page (swap, migration), the relevant PG_anon_exclusive bit has to be cleared to mark the anonymous page possibly shared. Clearing will fail if there are GUP pins on the page: * For fork(), this means having to copy the page and not being able to share it. fork() protects against concurrent GUP using the PT lock and the src_mm->write_protect_seq. * For KSM, this means sharing will fail. For swap this means, unmapping will fail, For migration this means, migration will fail early. All three cases protect against concurrent GUP using the PT lock and a proper clear/invalidate+flush of the relevant page table entry. This fixes memory corruptions reported for FOLL_PIN | FOLL_WRITE, when a pinned page gets mapped R/O and the successive write fault ends up replacing the page instead of reusing it. It improves the situation for O_DIRECT/vmsplice/... that still use FOLL_GET instead of FOLL_PIN, if fork() is *not* involved, however swapout and fork() are still problematic. Properly using FOLL_PIN instead of FOLL_GET for these GUP users will fix the issue for them. I. Details about basic handling I.1. Fresh anonymous pages page_add_new_anon_rmap() and hugepage_add_new_anon_rmap() will mark the given page exclusive via __page_set_anon_rmap(exclusive=1). As that is the mechanism fresh anonymous pages come into life (besides migration code where we copy the page->mapping), all fresh anonymous pages will start out as exclusive. I.2. COW reuse handling of anonymous pages When a COW handler stumbles over a (sub)page that's marked exclusive, it simply reuses it. Otherwise, the handler tries harder under page lock to detect if the (sub)page is exclusive and can be reused. If exclusive, page_move_anon_rmap() will mark the given (sub)page exclusive. Note that hugetlb code does not yet check for PageAnonExclusive(), as it still uses the old COW logic that is prone to the COW security issue because hugetlb code cannot really tolerate unnecessary/wrong COW as huge pages are a scarce resource. I.3. Migration handling try_to_migrate() has to try marking an exclusive anonymous page shared via page_try_share_anon_rmap(). If it fails because there are GUP pins on the page, unmap fails. migrate_vma_collect_pmd() and __split_huge_pmd_locked() are handled similarly. Writable migration entries implicitly point at shared anonymous pages. For readable migration entries that information is stored via a new "readable-exclusive" migration entry, specific to anonymous pages. When restoring a migration entry in remove_migration_pte(), information about exlusivity is detected via the migration entry type, and RMAP_EXCLUSIVE is set accordingly for page_add_anon_rmap()/hugepage_add_anon_rmap() to restore that information. I.4. Swapout handling try_to_unmap() has to try marking the mapped page possibly shared via page_try_share_anon_rmap(). If it fails because there are GUP pins on the page, unmap fails. For now, information about exclusivity is lost. In the future, we might want to remember that information in the swap entry in some cases, however, it requires more thought, care, and a way to store that information in swap entries. I.5. Swapin handling do_swap_page() will never stumble over exclusive anonymous pages in the swap cache, as try_to_migrate() prohibits that. do_swap_page() always has to detect manually if an anonymous page is exclusive and has to set RMAP_EXCLUSIVE for page_add_anon_rmap() accordingly. I.6. THP handling __split_huge_pmd_locked() has to move the information about exclusivity from the PMD to the PTEs. a) In case we have a readable-exclusive PMD migration entry, simply insert readable-exclusive PTE migration entries. b) In case we have a present PMD entry and we don't want to freeze ("convert to migration entries"), simply forward PG_anon_exclusive to all sub-pages, no need to temporarily clear the bit. c) In case we have a present PMD entry and want to freeze, handle it similar to try_to_migrate(): try marking the page shared first. In case we fail, we ignore the "freeze" instruction and simply split ordinarily. try_to_migrate() will properly fail because the THP is still mapped via PTEs. When splitting a compound anonymous folio (THP), the information about exclusivity is implicitly handled via the migration entries: no need to replicate PG_anon_exclusive manually. I.7. fork() handling fork() handling is relatively easy, because PG_anon_exclusive is only expressive for some page table entry types. a) Present anonymous pages page_try_dup_anon_rmap() will mark the given subpage shared -- which will fail if the page is pinned. If it failed, we have to copy (or PTE-map a PMD to handle it on the PTE level). Note that device exclusive entries are just a pointer at a PageAnon() page. fork() will first convert a device exclusive entry to a present page table and handle it just like present anonymous pages. b) Device private entry Device private entries point at PageAnon() pages that cannot be mapped directly and, therefore, cannot get pinned. page_try_dup_anon_rmap() will mark the given subpage shared, which cannot fail because they cannot get pinned. c) HW poison entries PG_anon_exclusive will remain untouched and is stale -- the page table entry is just a placeholder after all. d) Migration entries Writable and readable-exclusive entries are converted to readable entries: possibly shared. I.8. mprotect() handling mprotect() only has to properly handle the new readable-exclusive migration entry: When write-protecting a migration entry that points at an anonymous page, remember the information about exclusivity via the "readable-exclusive" migration entry type. II. Migration and GUP-fast Whenever replacing a present page table entry that maps an exclusive anonymous page by a migration entry, we have to mark the page possibly shared and synchronize against GUP-fast by a proper clear/invalidate+flush to make the following scenario impossible: 1. try_to_migrate() places a migration entry after checking for GUP pins and marks the page possibly shared. 2. GUP-fast pins the page due to lack of synchronization 3. fork() converts the "writable/readable-exclusive" migration entry into a readable migration entry 4. Migration fails due to the GUP pin (failing to freeze the refcount) 5. Migration entries are restored. PG_anon_exclusive is lost -> We have a pinned page that is not marked exclusive anymore. Note that we move information about exclusivity from the page to the migration entry as it otherwise highly overcomplicates fork() and PTE-mapping a THP. III. Swapout and GUP-fast Whenever replacing a present page table entry that maps an exclusive anonymous page by a swap entry, we have to mark the page possibly shared and synchronize against GUP-fast by a proper clear/invalidate+flush to make the following scenario impossible: 1. try_to_unmap() places a swap entry after checking for GUP pins and clears exclusivity information on the page. 2. GUP-fast pins the page due to lack of synchronization. -> We have a pinned page that is not marked exclusive anymore. If we'd ever store information about exclusivity in the swap entry, similar to migration handling, the same considerations as in II would apply. This is future work. Link: https://lkml.kernel.org/r/20220428083441.37290-13-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/rmap.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/swap.h | 15 +++++++++++---- include/linux/swapops.h | 25 +++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e4156921eea9..cbe279a6f0de 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -228,6 +228,13 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, { VM_BUG_ON_PAGE(!PageAnon(page), page); + /* + * No need to check+clear for already shared pages, including KSM + * pages. + */ + if (!PageAnonExclusive(page)) + goto dup; + /* * If this page may have been pinned by the parent process, * don't allow to duplicate the mapping but instead require to e.g., @@ -239,14 +246,47 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, unlikely(page_needs_cow_for_dma(vma, page)))) return -EBUSY; + ClearPageAnonExclusive(page); /* * It's okay to share the anon page between both processes, mapping * the page R/O into both processes. */ +dup: __page_dup_rmap(page, compound); return 0; } +/** + * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly + * shared to prepare for KSM or temporary unmapping + * @page: the exclusive anonymous page to try marking possibly shared + * + * The caller needs to hold the PT lock and has to have the page table entry + * cleared/invalidated+flushed, to properly sync against GUP-fast. + * + * This is similar to page_try_dup_anon_rmap(), however, not used during fork() + * to duplicate a mapping, but instead to prepare for KSM or temporarily + * unmapping a page (swap, migration) via page_remove_rmap(). + * + * Marking the page shared can only fail if the page may be pinned; device + * private pages cannot get pinned and consequently this function cannot fail. + * + * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY + * otherwise. + */ +static inline int page_try_share_anon_rmap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); + + /* See page_try_dup_anon_rmap(). */ + if (likely(!is_device_private_page(page) && + unlikely(page_maybe_dma_pinned(page)))) + return -EBUSY; + + ClearPageAnonExclusive(page); + return 0; +} + /* * Called from mm/vmscan.c to handle paging out */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 27093b477c5f..e6d70a4156e8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -78,12 +78,19 @@ static inline int current_is_kswapd(void) #endif /* - * NUMA node memory migration support + * Page migration support. + * + * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and + * indicates that the referenced (part of) an anonymous page is exclusive to + * a single process. For SWP_MIGRATION_WRITE, that information is implicit: + * (part of) an anonymous page that are mapped writable are exclusive to a + * single process. */ #ifdef CONFIG_MIGRATION -#define SWP_MIGRATION_NUM 2 -#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) -#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) +#define SWP_MIGRATION_NUM 3 +#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM) +#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1) +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2) #else #define SWP_MIGRATION_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 5af852b68805..6648b97244e7 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -194,6 +194,7 @@ static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } @@ -202,11 +203,26 @@ static inline int is_writable_migration_entry(swp_entry_t entry) return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } +static inline int is_readable_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ); +} + +static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); +} + static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } +static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) +{ + return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); @@ -224,6 +240,11 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } +static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) +{ + return swp_entry(0, 0); +} + static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); @@ -244,6 +265,10 @@ static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } +static inline int is_readable_migration_entry(swp_entry_t entry) +{ + return 0; +} #endif -- cgit v1.2.3 From 7f5abe609b3dcbc62a36e18b1437f4f3521ecb75 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:44 -0700 Subject: mm/rmap: fail try_to_migrate() early when setting a PMD migration entry fails Let's fail right away in case we cannot clear PG_anon_exclusive because the anon THP may be pinned. Right now, we continue trying to install migration entries and the caller of try_to_migrate() will realize that the page is still mapped and has to restore the migration entries. Let's just fail fast just like for PTE migration entries. Link: https://lkml.kernel.org/r/20220428083441.37290-14-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6648b97244e7..e476a8fed537 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -299,7 +299,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, +extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, @@ -332,7 +332,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else -static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, +static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); -- cgit v1.2.3 From c89357e27f20dda3fff6791d27bb6c91eae99f4a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm: support GUP-triggered unsharing of anonymous pages Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. We want to implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. To implement 3), we need a way for GUP to trigger unsharing: FAULT_FLAG_UNSHARE. FAULT_FLAG_UNSHARE is only applicable to R/O mapped anonymous pages and resembles COW logic during a write fault. However, in contrast to a write fault, GUP-triggered unsharing will, for example, still maintain the write protection. Let's implement FAULT_FLAG_UNSHARE by hooking into the existing write fault handlers for all applicable anonymous page types: ordinary pages, THP and hugetlb. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that has been marked exclusive in the meantime by someone else, there is nothing to do. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that's not marked exclusive, it will try detecting if the process is the exclusive owner. If exclusive, it can be set exclusive similar to reuse logic during write faults via page_move_anon_rmap() and there is nothing else to do; otherwise, we either have to copy and map a fresh, anonymous exclusive page R/O (ordinary pages, hugetlb), or split the THP. This commit is heavily based on patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-16-david@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Co-developed-by: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 87eddd509de2..7216d77a5884 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -819,6 +819,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark + * exclusive) a possibly shared anonymous page that is + * mapped R/O. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -838,6 +841,10 @@ typedef struct { * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. + * + * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. + * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when + * no existing R/O-mapped anonymous page is encountered. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, @@ -850,6 +857,7 @@ enum fault_flag { FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, }; #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From a7f226604170acd6b142b76472c1a49c12ebb83d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/gup: trigger FAULT_FLAG_UNSHARE when R/O-pinning a possibly shared anonymous page Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. Let's implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. This change implies that whenever user space *wrote* to a private mapping (IOW, we have an anonymous page mapped), that GUP pins will always remain consistent: reliable R/O GUP pins of anonymous pages. As a side note, this commit fixes the COW security issue for hugetlb with FOLL_PIN as documented in: https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com The vmsplice reproducer still applies, because vmsplice uses FOLL_GET instead of FOLL_PIN. Note that follow_huge_pmd() doesn't apply because we cannot end up in there with FOLL_PIN. This commit is heavily based on prototype patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-17-david@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Co-developed-by: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Rientjes Cc: Don Dutile Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9ee3ae51e8e3..5dc5005ccc9b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3002,6 +3002,45 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) return 0; } +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. + * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): + * clear/invalidate+flush of the page table entry + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + */ +static inline bool gup_must_unshare(unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) + return false; + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. + */ + return !PageAnonExclusive(page); +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -- cgit v1.2.3 From 1493a1913e34b0ac366e33f9ebad721e69fd06ac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:45 -0700 Subject: mm/swap: remember PG_anon_exclusive via a swp pte bit Patch series "mm: COW fixes part 3: reliable GUP R/W FOLL_GET of anonymous pages", v2. This series fixes memory corruptions when a GUP R/W reference (FOLL_WRITE | FOLL_GET) was taken on an anonymous page and COW logic fails to detect exclusivity of the page to then replacing the anonymous page by a copy in the page table: The GUP reference lost synchronicity with the pages mapped into the page tables. This series focuses on x86, arm64, s390x and ppc64/book3s -- other architectures are fairly easy to support by implementing __HAVE_ARCH_PTE_SWP_EXCLUSIVE. This primarily fixes the O_DIRECT memory corruptions that can happen on concurrent swapout, whereby we lose DMA reads to a page (modifying the user page by writing to it). O_DIRECT currently uses FOLL_GET for short-term (!FOLL_LONGTERM) DMA from/to a user page. In the long run, we want to convert it to properly use FOLL_PIN, and John is working on it, but that might take a while and might not be easy to backport. In the meantime, let's restore what used to work before we started modifying our COW logic: make R/W FOLL_GET references reliable as long as there is no fork() after GUP involved. This is just the natural follow-up of part 2, that will also further reduce "wrong COW" on the swapin path, for example, when we cannot remove a page from the swapcache due to concurrent writeback, or if we have two threads faulting on the same swapped-out page. Fixing O_DIRECT is just a nice side-product This issue, including other related COW issues, has been summarized in [3] under 2): " 2. Intra Process Memory Corruptions due to Wrong COW (FOLL_GET) It was discovered that we can create a memory corruption by reading a file via O_DIRECT to a part (e.g., first 512 bytes) of a page, concurrently writing to an unrelated part (e.g., last byte) of the same page, and concurrently write-protecting the page via clear_refs SOFTDIRTY tracking [6]. For the reproducer, the issue is that O_DIRECT grabs a reference of the target page (via FOLL_GET) and clear_refs write-protects the relevant page table entry. On successive write access to the page from the process itself, we wrongly COW the page when resolving the write fault, resulting in a loss of synchronicity and consequently a memory corruption. While some people might think that using clear_refs in this combination is a corner cases, it turns out to be a more generic problem unfortunately. For example, it was just recently discovered that we can similarly create a memory corruption without clear_refs, simply by concurrently swapping out the buffer pages [7]. Note that we nowadays even use the swap infrastructure in Linux without an actual swap disk/partition: the prime example is zram which is enabled as default under Fedora [10]. The root issue is that a write-fault on a page that has additional references results in a COW and thereby a loss of synchronicity and consequently a memory corruption if two parties believe they are referencing the same page. " We don't particularly care about R/O FOLL_GET references: they were never reliable and O_DIRECT doesn't expect to observe modifications from a page after DMA was started. Note that: * this only fixes the issue on x86, arm64, s390x and ppc64/book3s ("enterprise architectures"). Other architectures have to implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE to achieve the same. * this does *not * consider any kind of fork() after taking the reference: fork() after GUP never worked reliably with FOLL_GET. * Not losing PG_anon_exclusive during swapout was the last remaining piece. KSM already makes sure that there are no other references on a page before considering it for sharing. Page migration maintains PG_anon_exclusive and simply fails when there are additional references (freezing the refcount fails). Only swapout code dropped the PG_anon_exclusive flag because it requires more work to remember + restore it. With this series in place, most COW issues of [3] are fixed on said architectures. Other architectures can implement __HAVE_ARCH_PTE_SWP_EXCLUSIVE fairly easily. [1] https://lkml.kernel.org/r/20220329160440.193848-1-david@redhat.com [2] https://lkml.kernel.org/r/20211217113049.23850-1-david@redhat.com [3] https://lore.kernel.org/r/3ae33b08-d9ef-f846-56fb-645e3b9b4c66@redhat.com This patch (of 8): Currently, we clear PG_anon_exclusive in try_to_unmap() and forget about it. We do this, to keep fork() logic on swap entries easy and efficient: for example, if we wouldn't clear it when unmapping, we'd have to lookup the page in the swapcache for each and every swap entry during fork() and clear PG_anon_exclusive if set. Instead, we want to store that information directly in the swap pte, protected by the page table lock, similarly to how we handle SWP_MIGRATION_READ_EXCLUSIVE for migration entries. However, for actual swap entries, we don't want to mess with the swap type (e.g., still one bit) because it overcomplicates swap code. In try_to_unmap(), we already reject to unmap in case the page might be pinned, because we must not lose PG_anon_exclusive on pinned pages ever. Checking if there are other unexpected references reliably *before* completely unmapping a page is unfortunately not really possible: THP heavily overcomplicate the situation. Once fully unmapped it's easier -- we, for example, make sure that there are no unexpected references *after* unmapping a page before starting writeback on that page. So, we currently might end up unmapping a page and clearing PG_anon_exclusive if that page has additional references, for example, due to a FOLL_GET. do_swap_page() has to re-determine if a page is exclusive, which will easily fail if there are other references on a page, most prominently GUP references via FOLL_GET. This can currently result in memory corruptions when taking a FOLL_GET | FOLL_WRITE reference on a page even when fork() is never involved: try_to_unmap() will succeed, and when refaulting the page, it cannot be marked exclusive and will get replaced by a copy in the page tables on the next write access, resulting in writes via the GUP reference to the page being lost. In an ideal world, everybody that uses GUP and wants to modify page content, such as O_DIRECT, would properly use FOLL_PIN. However, that conversion will take a while. It's easier to fix what used to work in the past (FOLL_GET | FOLL_WRITE) remembering PG_anon_exclusive. In addition, by remembering PG_anon_exclusive we can further reduce unnecessary COW in some cases, so it's the natural thing to do. So let's transfer the PG_anon_exclusive information to the swap pte and store it via an architecture-dependant pte bit; use that information when restoring the swap pte in do_swap_page() and unuse_pte(). During fork(), we simply have to clear the pte bit and are done. Of course, there is one corner case to handle: swap backends that don't support concurrent page modifications while the page is under writeback. Special case these, and drop the exclusive marker. Add a comment why that is just fine (also, reuse_swap_page() would have done the same in the past). In the future, we'll hopefully have all architectures support __HAVE_ARCH_PTE_SWP_EXCLUSIVE, such that we can get rid of the empty stubs and the define completely. Then, we can also convert SWP_MIGRATION_READ_EXCLUSIVE. For architectures it's fairly easy to support: either simply use a yet unused pte bit that can be used for swap entries, steal one from the arch type bits if they exceed 5, or steal one from the offset bits. Note: R/O FOLL_GET references were never really reliable, especially when taking one on a shared page and then writing to the page (e.g., GUP after fork()). FOLL_GET, including R/W references, were never really reliable once fork was involved (e.g., GUP before fork(), GUP during fork()). KSM steps back in case it stumbles over unexpected references and is, therefore, fine. [david@redhat.com: fix SWP_STABLE_WRITES test] Link: https://lkml.kernel.org/r/ac725bcb-313a-4fff-250a-68ba9a8f85fb@redhat.comLink: https://lkml.kernel.org/r/20220329164329.208407-1-david@redhat.com Link: https://lkml.kernel.org/r/20220329164329.208407-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Shakeel Butt Cc: John Hubbard Cc: Jason Gunthorpe Cc: Mike Kravetz Cc: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox (Oracle) Cc: Jann Horn Cc: Michal Hocko Cc: Nadav Amit Cc: Rik van Riel Cc: Roman Gushchin Cc: Andrea Arcangeli Cc: Peter Xu Cc: Don Dutile Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jan Kara Cc: Liang Zhang Cc: Pedro Demarchi Gomes Cc: Oded Gabbay Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Gerald Schaefer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 29 +++++++++++++++++++++++++++++ include/linux/swapops.h | 2 ++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f4f4077b97aa..53750224e176 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1003,6 +1003,35 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * When replacing an anonymous page by a real (!non) swap entry, we clear + * PG_anon_exclusive from the page and instead remember whether the flag was + * set in the swp pte. During fork(), we have to mark the entry as !exclusive + * (possibly shared). On swapin, we use that information to restore + * PG_anon_exclusive, which is very helpful in cases where we might have + * additional (e.g., FOLL_GET) references on a page and wouldn't be able to + * detect exclusivity. + * + * These functions don't apply to non-swap entries (e.g., migration, hwpoison, + * ...). + */ +#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return false; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte; +} +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index e476a8fed537..d1b728904d4e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -26,6 +26,8 @@ /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { + if (pte_swp_exclusive(pte)) + pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) -- cgit v1.2.3 From 014bb1de4fc17d54907d54418126a9a9736f4aff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm: create new mm/swap.h header file Patch series "MM changes to improve swap-over-NFS support". Assorted improvements for swap-via-filesystem. This is a resend of these patches, rebased on current HEAD. The only substantial changes is that swap_dirty_folio has replaced swap_set_page_dirty. Currently swap-via-fs (SWP_FS_OPS) doesn't work for any filesystem. It has previously worked for NFS but that broke a few releases back. This series changes to use a new ->swap_rw rather than ->readpage and ->direct_IO. It also makes other improvements. There is a companion series already in linux-next which fixes various issues with NFS. Once both series land, a final patch is needed which changes NFS over to use ->swap_rw. This patch (of 10): Many functions declared in include/linux/swap.h are only used within mm/ Create a new "mm/swap.h" and move some of these declarations there. Remove the redundant 'extern' from the function declarations. [akpm@linux-foundation.org: mm/memory-failure.c needs mm/swap.h] Link: https://lkml.kernel.org/r/164859751830.29473.5309689752169286816.stgit@noble.brown Link: https://lkml.kernel.org/r/164859778120.29473.11725907882296224053.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Trond Myklebust Cc: Hugh Dickins Cc: Mel Gorman Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/swap.h | 121 --------------------------------------------------- 1 file changed, 121 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e6d70a4156e8..def67112dce4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -427,62 +427,19 @@ extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -#include /* for bio_end_io_t */ - -/* linux/mm/page_io.c */ -extern int swap_readpage(struct page *page, bool do_poll); -extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern void end_swap_bio_write(struct bio *bio); -extern int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func); bool swap_dirty_folio(struct address_space *mapping, struct folio *folio); - int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); -/* linux/mm/swap_state.c */ -/* One swap address space for each 64M swap space */ -#define SWAP_ADDRESS_SPACE_SHIFT 14 -#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) static inline unsigned long total_swapcache_pages(void) { return global_node_page_state(NR_SWAPCACHE); } -extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *page); -extern void *get_shadow_from_swap_cache(swp_entry_t entry); -extern int add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp, void **shadowp); -extern void __delete_from_swap_cache(struct page *page, - swp_entry_t entry, void *shadow); -extern void delete_from_swap_cache(struct page *); -extern void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end); -extern void free_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); -extern struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); -extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr, - bool do_poll); -extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated); -extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); -extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); - /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -535,12 +492,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ - -static inline int swap_readpage(struct page *page, bool do_poll) -{ - return 0; -} - static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return NULL; @@ -555,11 +506,6 @@ static inline void put_swap_device(struct swap_info_struct *si) { } -static inline struct address_space *swap_address_space(swp_entry_t entry) -{ - return NULL; -} - #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL @@ -574,14 +520,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_cache(struct page *page) -{ -} - -static inline void show_swap_cache_info(void) -{ -} - /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) @@ -607,65 +545,6 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp) { } -static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) -{ - return NULL; -} - -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, - struct vm_fault *vmf) -{ - return NULL; -} - -static inline int swap_writepage(struct page *p, struct writeback_control *wbc) -{ - return 0; -} - -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) -{ - return NULL; -} - -static inline -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) -{ - return find_get_page(mapping, index); -} - -static inline int add_to_swap(struct page *page) -{ - return 0; -} - -static inline void *get_shadow_from_swap_cache(swp_entry_t entry) -{ - return NULL; -} - -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask, void **shadowp) -{ - return -1; -} - -static inline void __delete_from_swap_cache(struct page *page, - swp_entry_t entry, void *shadow) -{ -} - -static inline void delete_from_swap_cache(struct page *page) -{ -} - -static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) -{ -} static inline int page_swapcount(struct page *page) { -- cgit v1.2.3 From 4c4a763406ef903b78334bd2ccea168d2f7a741a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:47 -0700 Subject: mm: drop swap_dirty_folio folios that are written to swap are owned by the MM subsystem - not any filesystem. When such a folio is passed to a filesystem to be written out to a swap-file, the filesystem handles the data, but the folio itself does not belong to the filesystem. So calling the filesystem's ->dirty_folio() address_space operation makes no sense. This is for folios in the given address space, and a folio to be written to swap does not exist in the given address space. So drop swap_dirty_folio() which calls the address-space's ->dirty_folio(), and always use noop_dirty_folio(), which is appropriate for folios being swapped out. Link: https://lkml.kernel.org/r/164859778123.29473.6900942583784889976.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index def67112dce4..8170254c56b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -427,7 +427,6 @@ extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -bool swap_dirty_folio(struct address_space *mapping, struct folio *folio); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); int generic_swapfile_activate(struct swap_info_struct *, struct file *, -- cgit v1.2.3 From 4b60c0ff2f2021ab99b7fb9da63b7ed1579ef1d8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: move responsibility for setting SWP_FS_OPS to ->swap_activate If a filesystem wishes to handle all swap IO itself (via ->direct_IO and ->readpage), rather than just providing devices addresses for submit_bio(), SWP_FS_OPS must be set. Currently the protocol for setting this it to have ->swap_activate return zero. In that case SWP_FS_OPS is set, and add_swap_extent() is called for the entire file. This is a little clumsy as different return values for ->swap_activate have quite different meanings, and it makes it hard to search for which filesystems require SWP_FS_OPS to be set. So remove the special meaning of a zero return, and require the filesystem to set SWP_FS_OPS if it so desires, and to always call add_swap_extent() as required. Currently only NFS and CIFS return zero for add_swap_extent(). Link: https://lkml.kernel.org/r/164859778123.29473.17908205846599043598.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8170254c56b0..7daae5a4b3e1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -577,6 +577,12 @@ static inline swp_entry_t get_swap_page(struct page *page) return entry; } +static inline int add_swap_extent(struct swap_info_struct *sis, + unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + return -EINVAL; +} #endif /* CONFIG_SWAP */ #ifdef CONFIG_THP_SWAP -- cgit v1.2.3 From e1209d3a7a67c281260ba9989621060ba7328b8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space swap currently uses ->readpage to read swap pages. This can only request one page at a time from the filesystem, which is not most efficient. swap uses ->direct_IO for writes which while this is adequate is an inappropriate over-loading. ->direct_IO may need to had handle allocate space for holes or other details that are not relevant for swap. So this patch introduces a new address_space operation: ->swap_rw. In this patch it is used for reads, and a subsequent patch will switch writes to use it. No filesystem yet supports ->swap_rw, but that is not a problem because no filesystem actually works with filesystem-based swap. Only two filesystems set SWP_FS_OPS: - cifs sets the flag, but ->direct_IO always fails so swap cannot work. - nfs sets the flag, but ->direct_IO calls generic_write_checks() which has failed on swap files for several releases. To ensure that a NULL ->swap_rw isn't called, ->activate_swap() for both NFS and cifs are changed to fail if ->swap_rw is not set. This can be removed if/when the function is added. Future patches will restore swap-over-NFS functionality. To submit an async read with ->swap_rw() we need to allocate a structure to hold the kiocb and other details. swap_readpage() cannot handle transient failure, so we create a mempool to provide the structures. Link: https://lkml.kernel.org/r/164859778125.29473.13430559328221330589.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..dbc5d10328df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,6 +380,7 @@ struct address_space_operations { int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); + int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; -- cgit v1.2.3 From eb79f3af9395fbe448e91b0940a6c395b7d06be4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:48 -0700 Subject: nfs: rename nfs_direct_IO and use as ->swap_rw The nfs_direct_IO() exists to support SWAP IO, but hasn't worked for a while. We now need a ->swap_rw function which behaves slightly differently, returning zero for success rather than a byte count. So modify nfs_direct_IO accordingly, rename it, and use it as the ->swap_rw function. Link: https://lkml.kernel.org/r/165119301493.15698.7491285551903597618.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: Geert Uytterhoeven (on Renesas RSK+RZA1 with 32 MiB of SDRAM) Cc: David Howells Cc: Hugh Dickins Cc: Mel Gorman Cc: Miaohe Lin Cc: Trond Myklebust Signed-off-by: Andrew Morton --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b48b9259e02c..fd5543486a3f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -507,7 +507,7 @@ static inline const struct cred *nfs_file_cred(struct file *file) /* * linux/fs/nfs/direct.c */ -extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap); ssize_t nfs_file_direct_write(struct kiocb *iocb, -- cgit v1.2.3 From 2282679fb20bf036a714ed49fadd0230c278a203 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: mm: submit multipage write for SWP_FS_OPS swap-space swap_writepage() is given one page at a time, but may be called repeatedly in succession. For block-device swapspace, the blk_plug functionality allows the multiple pages to be combined together at lower layers. That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is only active when CONFIG_BLOCK=y. Consequently all swap reads over NFS are single page reads. With this patch we pass a pointer-to-pointer via the wbc. swap_writepage can store state between calls - much like the pointer passed explicitly to swap_readpage. After calling swap_writepage() some number of times, the state will be passed to swap_write_unplug() which can submit the combined request. Link: https://lkml.kernel.org/r/164859778128.29473.5191868522654408537.stgit@noble.brown Signed-off-by: NeilBrown Reviewed-by: Christoph Hellwig Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/writeback.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec..32b35f21cb97 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -80,6 +80,13 @@ struct writeback_control { unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ + /* To enable batching of swap writes to non-block-device backends, + * "plug" can be set point to a 'struct swap_iocb *'. When all swap + * writes have been submitted, if with swap_iocb is not NULL, + * swap_write_unplug() should be called. + */ + struct swap_iocb **swap_plug; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ -- cgit v1.2.3 From a2ad63daa88b9d6846976fd2a0b5e4f5cfc58377 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 May 2022 18:20:49 -0700 Subject: VFS: add FMODE_CAN_ODIRECT file flag Currently various places test if direct IO is possible on a file by checking for the existence of the direct_IO address space operation. This is a poor choice, as the direct_IO operation may not be used - it is only used if the generic_file_*_iter functions are called for direct IO and some filesystems - particularly NFS - don't do this. Instead, introduce a new f_mode flag: FMODE_CAN_ODIRECT and change the various places to check this (avoiding pointer dereferences). do_dentry_open() will set this flag if ->direct_IO is present, so filesystems do not need to be changed. NFS *is* changed, to set the flag explicitly and discard the direct_IO entry in the address_space_operations for files. Other filesystems which currently use noop_direct_IO could usefully be changed to set this flag instead. Link: https://lkml.kernel.org/r/164859778128.29473.15189737957277399416.stgit@noble.brown Reviewed-by: Christoph Hellwig Signed-off-by: NeilBrown Tested-by: David Howells Tested-by: Geert Uytterhoeven Cc: Hugh Dickins Cc: Mel Gorman Cc: Trond Myklebust Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index dbc5d10328df..b81cacc51d2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -162,6 +162,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000) +/* File supports DIRECT IO */ +#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -- cgit v1.2.3 From fa29000b6b2603ec2bfdc4c73249fcb00cd54f85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 17:00:05 -0400 Subject: fs: Add aops->release_folio This replaces aops->releasepage. Update the documentation, and call it if it exists. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f812f5aa07dd..ad768f13f485 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -355,6 +355,7 @@ struct address_space_operations { /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); + bool (*release_folio)(struct folio *, gfp_t); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); -- cgit v1.2.3 From 8597447dc565a6a3fa7bc503674452b7ae2b914c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 30 Apr 2022 23:01:08 -0400 Subject: iomap: Convert to release_folio Change all the filesystems which used iomap_releasepage to use the new function. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5b2aa45ddda3..0d674695b6d3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -228,7 +228,7 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); -int iomap_releasepage(struct page *page, gfp_t gfp_mask); +bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); #ifdef CONFIG_MIGRATION int iomap_migrate_page(struct address_space *mapping, struct page *newpage, -- cgit v1.2.3 From 704ead2bed202579f025a4754e52e9ab21ff3ada Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 00:27:53 -0400 Subject: fs: Remove last vestiges of releasepage All users are now converted to release_folio Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- include/linux/fs.h | 1 - include/linux/page-flags.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ad768f13f485..1cee64d9724b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,7 +356,6 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); - int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9d8eeaa67d05..af10149a6c31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -516,7 +516,7 @@ PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. - * - PG_private and PG_private_2 cause releasepage() and co to be invoked + * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) -- cgit v1.2.3 From c56a6eb03deb187c989a966fda5a254249b56c2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 00:46:03 -0400 Subject: jbd2: Convert jbd2_journal_try_to_free_buffers to take a folio Also convert it to return a bool since it's called from release_folio(). Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Theodore Ts'o Reviewed-by: Jeff Layton --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index de9536680b2b..e79d6e0b14e8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1529,7 +1529,7 @@ extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); int jbd2_journal_invalidate_folio(journal_t *, struct folio *, size_t offset, size_t length); -extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page); +bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio); extern int jbd2_journal_stop(handle_t *); extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); extern void jbd2_journal_lock_updates (journal_t *); -- cgit v1.2.3 From 68189fef88c7d02eb92e038be3d6428ebd0d2945 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 01:08:08 -0400 Subject: fs: Change try_to_free_buffers() to take a folio All but two of the callers already have a folio; pass a folio into try_to_free_buffers(). This removes the last user of cancel_dirty_page() so remove that wrapper function too. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton --- include/linux/buffer_head.h | 4 ++-- include/linux/pagemap.h | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 31d82fd9abe8..c9d1463bb20f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -158,7 +158,7 @@ void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); -int try_to_free_buffers(struct page *); +bool try_to_free_buffers(struct folio *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, @@ -402,7 +402,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #else /* CONFIG_BLOCK */ static inline void buffer_init(void) {} -static inline int try_to_free_buffers(struct page *page) { return 1; } +static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 831b28dab01a..82dfb279e0c4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1067,10 +1067,6 @@ static inline void folio_cancel_dirty(struct folio *folio) if (folio_test_dirty(folio)) __folio_cancel_dirty(folio); } -static inline void cancel_dirty_page(struct page *page) -{ - folio_cancel_dirty(page_folio(page)); -} bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); void folio_invalidate(struct folio *folio, size_t offset, size_t length); -- cgit v1.2.3 From d2329aa0c78f4a8dd368bb706f196ab99f692eaa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 May 2022 07:35:31 -0400 Subject: fs: Add free_folio address space operation Include documentation and convert the callers to use ->free_folio as well as ->freepage. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1cee64d9724b..915844e6293e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct address_space_operations { sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); + void (*free_folio)(struct folio *folio); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* -- cgit v1.2.3 From 8560cb1a7d75048af275dd23fb0cf05382b3c2b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 May 2022 00:43:09 -0400 Subject: fs: Remove aops->freepage All implementations now use free_folio so we can delete the callers and the method. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 915844e6293e..6f305f1097a5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -357,7 +357,6 @@ struct address_space_operations { void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); - void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a page to the specified target. If -- cgit v1.2.3 From 8324a02c342a36336114a497130826612ed5520d Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Sun, 27 Mar 2022 17:45:32 +0300 Subject: net/mlx5: Add exit route when waiting for FW Currently, removing a device needs to get the driver interface lock before doing any cleanup. If the driver is waiting in a loop for FW init, there is no way to cancel the wait, instead the device cleanup waits for the loop to conclude and release the lock. To allow immediate response to remove device commands, check the TEARDOWN flag while waiting for FW init, and exit the loop if it has been set. Signed-off-by: Gavin Li Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ff47d49d8be4..f327d0544038 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -632,6 +632,7 @@ enum mlx5_device_state { enum mlx5_interface_state { MLX5_INTERFACE_STATE_UP = BIT(0), + MLX5_BREAK_FW_WAIT = BIT(1), }; enum mlx5_pci_status { -- cgit v1.2.3 From 34a30d7635a8e37275a7b63bec09035ed762969b Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 1 Mar 2022 15:42:01 +0000 Subject: net/mlx5: Lag, expose number of lag ports Downstream patches will add support for hardware lag with more than 2 ports. Add a way for users to query the number of lag ports. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f327d0544038..62ea1120de9c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1142,6 +1142,7 @@ int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, int num_counters, size_t *offsets); struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev); +u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, -- cgit v1.2.3 From 4cd14d44b11dabf195d1e66dadbb954336224658 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 1 Mar 2022 17:34:58 +0000 Subject: net/mlx5: Support devices with more than 2 ports Increase the define MLX5_MAX_PORTS to 4 as the driver is ready to support NICs with 4 ports. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 62ea1120de9c..fdb9d07a05a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -84,7 +84,7 @@ enum mlx5_sqp_t { }; enum { - MLX5_MAX_PORTS = 2, + MLX5_MAX_PORTS = 4, }; enum { -- cgit v1.2.3 From 7f46a0b7327ae261f9981888708dbca22c283900 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 15 Mar 2022 16:56:50 +0000 Subject: net/mlx5: Lag, add debugfs to query hardware lag state Lag state has become very complicated with many modes, flags, types and port selections methods and future work will add additional features. Add a debugfs to query the current lag state. A new directory named "lag" will be created under the mlx5 debugfs directory. As the driver has debugfs per pci function the location will be: /mlx5//lag For example: /sys/kernel/debug/mlx5/0000:08:00.0/lag The following files are exposed: - state: Returns "active" or "disabled". If "active" it means hardware lag is active. - members: Returns the BDFs of all the members of lag object. - type: Returns the type of the lag currently configured. Valid only if hardware lag is active. * "roce" - Members are bare metal PFs. * "switchdev" - Members are in switchdev mode. * "multipath" - ECMP offloads. - port_sel_mode: Returns the egress port selection method, valid only if hardware lag is active. * "queue_affinity" - Egress port is selected by the QP/SQ affinity. * "hash" - Egress port is selected by hash done on each packet. Controlled by: xmit_hash_policy of the bond device. - flags: Returns flags that are specific per lag @type. Valid only if hardware lag is active. * "shared_fdb" - "on" or "off", if "on" single FDB is used. - mapping: Returns the mapping which is used to select egress port. Valid only if hardware lag is active. If @port_sel_mode is "hash" returns the active egress ports. The hash result will select only active ports. if @port_sel_mode is "queue_affinity" returns the mapping between the configured port affinity of the QP/SQ and actual egress port. For example: * 1:1 - Mapping means if the configured affinity is port 1 traffic will egress via port 1. * 1:2 - Mapping means if the configured affinity is port 1 traffic will egress via port 2. This can happen if port 1 is down or in active/backup mode and port 1 is backup. Signed-off-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fdb9d07a05a4..d6bac3976913 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -558,6 +558,7 @@ struct mlx5_debugfs_entries { struct dentry *cq_debugfs; struct dentry *cmdif_debugfs; struct dentry *pages_debugfs; + struct dentry *lag_debugfs; }; struct mlx5_ft_pool; -- cgit v1.2.3 From 42704b26b0f1d891f6cf4ebc877dbac0d17c690d Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:37 +0200 Subject: ptp: Add cycles support for virtual clocks ptp vclocks require a free running time for their timecounter. Currently only a physical clock forced to free running is supported. If vclocks are used, then the physical clock cannot be synchronized anymore. The synchronized time is not available in hardware in this case. As a result, timed transmission with TAPRIO hardware support is not possible anymore. If hardware would support a free running time additionally to the physical clock, then the physical clock does not need to be forced to free running. Thus, the physical clocks can still be synchronized while vclocks are in use. The physical clock could be used to synchronize the time domain of the TSN network and trigger TAPRIO. In parallel vclocks can be used to synchronize other time domains. Introduce support for a free running cycle counter called cycles to physical clocks. Rework ptp vclocks to use this free running cycle counter. Default implementation is based on time of physical clock. Thus, behavior of ptp vclocks based on physical clocks without free running cycle counter is identical to previous behavior. Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/ptp_clock_kernel.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index e8cc8b6bbf50..ad309202cf9f 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -108,6 +108,32 @@ struct ptp_system_timestamp { * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. * + * @getcycles64: Reads the current free running cycle counter from the hardware + * clock. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @gettime64 or @gettimex64 will be used as default + * implementation. + * parameter ts: Holds the result. + * + * @getcyclesx64: Reads the current free running cycle counter from the + * hardware clock and optionally also the system clock. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @gettimex64 will be used as default implementation if + * available. + * parameter ts: Holds the PHC timestamp. + * parameter sts: If not NULL, it holds a pair of timestamps + * from the system clock. The first reading is made right before + * reading the lowest bits of the PHC timestamp and the second + * reading immediately follows that. + * + * @getcrosscycles: Reads the current free running cycle counter from the + * hardware clock and system clock simultaneously. + * If @getcycles64 and @getcyclesx64 are not supported, then + * @getcrosststamp will be used as default implementation if + * available. + * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @enable: Request driver to enable or disable an ancillary feature. * parameter request: Desired resource to enable or disable. * parameter on: Caller passes one to enable or zero to disable. @@ -155,6 +181,11 @@ struct ptp_clock_info { int (*getcrosststamp)(struct ptp_clock_info *ptp, struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); + int (*getcycles64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcyclesx64)(struct ptp_clock_info *ptp, struct timespec64 *ts, + struct ptp_system_timestamp *sts); + int (*getcrosscycles)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, -- cgit v1.2.3 From 51eb7492af276b5b4d27cfa4474d40bdac7b9cf8 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:38 +0200 Subject: ptp: Request cycles for TX timestamp The free running cycle counter of physical clocks called cycles shall be used for hardware timestamps to enable synchronisation. Introduce new flag SKBTX_HW_TSTAMP_USE_CYCLES, which signals driver to provide a TX timestamp based on cycles if cycles are supported. Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d58669d6cb91..4a4f25975ca2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -615,6 +615,9 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, + /* generate hardware time stamp based on cycles if supported */ + SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, + /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, @@ -624,7 +627,9 @@ enum { #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) -#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) +#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ + SKBTX_HW_TSTAMP_USE_CYCLES | \ + SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { -- cgit v1.2.3 From d58809d854c9ee19e4cd41023e137e65e9dc3f94 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:39 +0200 Subject: ptp: Pass hwtstamp to ptp_convert_timestamp() ptp_convert_timestamp() converts only the timestamp hwtstamp, which is a field of the argument with the type struct skb_shared_hwtstamps *. So a pointer to the hwtstamp field of this structure is sufficient. Rework ptp_convert_timestamp() to use an argument of type ktime_t *. This allows to add additional timestamp manipulation stages before the call of ptp_convert_timestamp(). Signed-off-by: Gerhard Engleder Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/ptp_clock_kernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad309202cf9f..92b44161408e 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -384,17 +384,16 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); /** * ptp_convert_timestamp() - convert timestamp to a ptp vclock time * - * @hwtstamps: skb_shared_hwtstamps structure pointer + * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * * Returns converted timestamp, or 0 on error. */ -ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, - int vclock_index); +ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } -static inline ktime_t ptp_convert_timestamp(const struct skb_shared_hwtstamps *hwtstamps, +static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { return 0; } -- cgit v1.2.3 From 97dc7cd92ac67f6e05df418df1772ba4a7fbf693 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:40 +0200 Subject: ptp: Support late timestamp determination If a physical clock supports a free running cycle counter, then timestamps shall be based on this time too. For TX it is known in advance before the transmission if a timestamp based on the free running cycle counter is needed. For RX it is impossible to know which timestamp is needed before the packet is received and assigned to a socket. Support late timestamp determination by a network device. Therefore, an address/cookie is stored within the new netdev_data field of struct skb_shared_hwtstamps. This address/cookie is provided to a new network device function called ndo_get_tstamp(), which returns a timestamp based on the normal/adjustable time or based on the free running cycle counter. If function is not supported, then timestamp handling is not changed. This mechanism is intended for RX, but TX use is also possible. Signed-off-by: Gerhard Engleder Acked-by: Jonathan Lemon Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 21 +++++++++++++++++++++ include/linux/skbuff.h | 14 +++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74c97a34921d..51fe143091cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1356,6 +1356,12 @@ struct netdev_net_notifier { * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address + * ktime_t (*ndo_get_tstamp)(struct net_device *dev, + * const struct skb_shared_hwtstamps *hwtstamps, + * bool cycles); + * Get hardware timestamp based on normal/adjustable time or free running + * cycle counter. This function is required if physical clock supports a + * free running cycle counter. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1578,6 +1584,9 @@ struct net_device_ops { struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + ktime_t (*ndo_get_tstamp)(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles); }; /** @@ -4761,6 +4770,18 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); +static inline ktime_t netdev_get_tstamp(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_tstamp) + return ops->ndo_get_tstamp(dev, hwtstamps, cycles); + + return hwtstamps->hwtstamp; +} + static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4a4f25975ca2..97de40bcfef6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -588,8 +588,10 @@ static inline bool skb_frag_must_loop(struct page *p) /** * struct skb_shared_hwtstamps - hardware time stamps - * @hwtstamp: hardware time stamp transformed into duration - * since arbitrary point in time + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @netdev_data: address/cookie of network device driver used as + * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. @@ -601,7 +603,10 @@ static inline bool skb_frag_must_loop(struct page *p) * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { - ktime_t hwtstamp; + union { + ktime_t hwtstamp; + void *netdev_data; + }; }; /* Definitions for tx_flags in struct skb_shared_info */ @@ -621,6 +626,9 @@ enum { /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, + /* determine hardware time stamp based on time or cycles */ + SKBTX_HW_TSTAMP_NETDEV = 1 << 5, + /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; -- cgit v1.2.3 From 57ce2e406fe1fa005e8fdbf20936c791252ac7bc Mon Sep 17 00:00:00 2001 From: Nava kishore Manne Date: Sat, 23 Apr 2022 22:32:32 +0530 Subject: fpga: fix for coding style issues fixes the below checks reported by checkpatch.pl: - Lines should not end with a '(' - Alignment should match open parenthesis Signed-off-by: Nava kishore Manne Link: https://lore.kernel.org/r/20220423170235.2115479-3-nava.manne@xilinx.com Signed-off-by: Xu Yilun --- include/linux/fpga/fpga-region.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fpga/fpga-region.h b/include/linux/fpga/fpga-region.h index 3b87f232425c..9d4d32909340 100644 --- a/include/linux/fpga/fpga-region.h +++ b/include/linux/fpga/fpga-region.h @@ -52,9 +52,9 @@ struct fpga_region { #define to_fpga_region(d) container_of(d, struct fpga_region, dev) -struct fpga_region *fpga_region_class_find( - struct device *start, const void *data, - int (*match)(struct device *, const void *)); +struct fpga_region * +fpga_region_class_find(struct device *start, const void *data, + int (*match)(struct device *, const void *)); int fpga_region_program_fpga(struct fpga_region *region); -- cgit v1.2.3 From 846e437387e74c44ddc9f3eeec472fd37ca3cdb9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 10 May 2022 12:02:03 +0300 Subject: net/mlx5: Expose mlx5_sriov_blocking_notifier_register / unregister APIs Expose mlx5_sriov_blocking_notifier_register / unregister APIs to let a VF register to be notified for its enablement / disablement by the PF. Upon VF probe it will call mlx5_sriov_blocking_notifier_register() with its notifier block and upon VF remove it will call mlx5_sriov_blocking_notifier_unregister() to drop its registration. This can give a VF the ability to clean some resources upon disable before that the command interface goes down and on the other hand sets some stuff before that it's enabled. This may be used by a VF which is migration capable in few cases.(e.g. PF load/unload upon an health recovery). Link: https://lore.kernel.org/r/20220510090206.90374-2-yishaih@nvidia.com Signed-off-by: Yishai Hadas Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ff47d49d8be4..6fac5427d899 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -445,6 +445,11 @@ struct mlx5_qp_table { struct radix_tree_root tree; }; +enum { + MLX5_PF_NOTIFY_DISABLE_VF, + MLX5_PF_NOTIFY_ENABLE_VF, +}; + struct mlx5_vf_context { int enabled; u64 port_guid; @@ -455,6 +460,7 @@ struct mlx5_vf_context { u8 port_guid_valid:1; u8 node_guid_valid:1; enum port_state_policy policy; + struct blocking_notifier_head notifier; }; struct mlx5_core_sriov { @@ -1152,6 +1158,12 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev); void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev); +int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); +void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); #ifdef CONFIG_MLX5_CORE_IPOIB struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, struct ib_device *ibdev, -- cgit v1.2.3 From 1ff297584fad2eef390f212b860e0fbb7363e0e8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 May 2022 10:29:36 -0700 Subject: randomize_kstack: Improve docs on requirements/rationale There were some recent questions about where and why to use the random_kstack routines when applying them to new architectures[1]. Update the header comments to reflect the design choices for the routines. [1] https://lore.kernel.org/lkml/1652173338.7bltwybi0c.astroid@bobo.none Cc: Nicholas Piggin Cc: Xiu Jianfeng Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 1468caf001c0..5d868505a94e 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -40,10 +40,14 @@ DECLARE_PER_CPU(u32, kstack_offset); */ #define KSTACK_OFFSET_MAX(x) ((x) & 0x3FF) -/* - * These macros must be used during syscall entry when interrupts and +/** + * add_random_kstack_offset - Increase stack utilization by previously + * chosen random offset + * + * This should be used in the syscall entry path when interrupts and * preempt are disabled, and after user registers have been stored to - * the stack. + * the stack. For testing the resulting entropy, please see: + * tools/testing/selftests/lkdtm/stack-entropy.sh */ #define add_random_kstack_offset() do { \ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ @@ -55,6 +59,23 @@ DECLARE_PER_CPU(u32, kstack_offset); } \ } while (0) +/** + * choose_random_kstack_offset - Choose the random offset for the next + * add_random_kstack_offset() + * + * This should only be used during syscall exit when interrupts and + * preempt are disabled. This position in the syscall flow is done to + * frustrate attacks from userspace attempting to learn the next offset: + * - Maximize the timing uncertainty visible from userspace: if the + * offset is chosen at syscall entry, userspace has much more control + * over the timing between choosing offsets. "How long will we be in + * kernel mode?" tends to be more difficult to predict than "how long + * will we be in user mode?" + * - Reduce the lifetime of the new offset sitting in memory during + * kernel mode execution. Exposure of "thread-local" memory content + * (e.g. current, percpu, etc) tends to be easier than arbitrary + * location memory exposure. + */ #define choose_random_kstack_offset(rand) do { \ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ -- cgit v1.2.3 From 9f88361273082825d9f0d13a543d49f9fa0d44a8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 10 May 2022 17:52:30 +0200 Subject: bpf: Add bpf_link iterator Implement bpf_link iterator to traverse links via bpf_seq_file operations. The changeset is mostly shamelessly copied from commit a228a64fc1e4 ("bpf: Add bpf_prog iterator") Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20220510155233.9815-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be94833d390a..551b7198ae8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1544,6 +1544,7 @@ void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); +struct bpf_link *bpf_link_get_curr_or_next(u32 *id); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); -- cgit v1.2.3 From d721def7392a7348ffb9f3583b264239cbd3702c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:12 +0200 Subject: kallsyms: Make kallsyms_on_each_symbol generally available Making kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS option is defined). Cc: Christoph Hellwig Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kallsyms.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ce1bd2fbf23e..ad39636e0c3f 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -65,11 +65,11 @@ static inline void *dereference_symbol_descriptor(void *ptr) return ptr; } +#ifdef CONFIG_KALLSYMS int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); @@ -163,6 +163,11 @@ static inline bool kallsyms_show_value(const struct cred *cred) return false; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), void *data) +{ + return -EOPNOTSUPP; +} #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) -- cgit v1.2.3 From bed0d9a50dacee6fcf785c555cfb0d2573355afc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 10 May 2022 14:26:13 +0200 Subject: ftrace: Add ftrace_lookup_symbols function Adding ftrace_lookup_symbols function that resolves array of symbols with single pass over kallsyms. The user provides array of string pointers with count and pointer to allocated array for resolved values. int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) It iterates all kallsyms symbols and tries to loop up each in provided symbols array with bsearch. The symbols array needs to be sorted by name for this reason. We also check each symbol to pass ftrace_location, because this API will be used for fprobe symbols resolving. Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220510122616.2652285-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..820500430eae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -303,6 +303,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct ftrace_regs *fregs); + +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs); #else /* !CONFIG_FUNCTION_TRACER */ /* * (un)register_ftrace_function must be a macro since the ops parameter @@ -313,6 +315,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } +static inline int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_FUNCTION_TRACER */ struct ftrace_func_entry { -- cgit v1.2.3 From ddccc9ef55992716ad477d38fbcd9f8f1d34fc67 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:54 -0700 Subject: skbuff: add a basic intro doc Add basic skb documentation. It's mostly an intro to the subsequent patches - it would looks strange if we documented advanced topics without covering the basics in any way. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97de40bcfef6..69802624276d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -800,6 +800,46 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +/** + * DOC: Basic sk_buff geometry + * + * struct sk_buff itself is a metadata structure and does not hold any packet + * data. All the data is held in associated buffers. + * + * &sk_buff.head points to the main "head" buffer. The head buffer is divided + * into two parts: + * + * - data buffer, containing headers and sometimes payload; + * this is the part of the skb operated on by the common helpers + * such as skb_put() or skb_pull(); + * - shared info (struct skb_shared_info) which holds an array of pointers + * to read-only data in the (page, offset, length) format. + * + * Optionally &skb_shared_info.frag_list may point to another skb. + * + * Basic diagram may look like this:: + * + * --------------- + * | sk_buff | + * --------------- + * ,--------------------------- + head + * / ,----------------- + data + * / / ,----------- + tail + * | | | , + end + * | | | | + * v v v v + * ----------------------------------------------- + * | headroom | data | tailroom | skb_shared_info | + * ----------------------------------------------- + * + [page frag] + * + [page frag] + * + [page frag] + * + [page frag] --------- + * + frag_list --> | sk_buff | + * --------- + * + */ + /** * struct sk_buff - socket buffer * @next: Next buffer in list -- cgit v1.2.3 From 9ec7ea1462084df695f34c5ac2d2d2250d9d6897 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:55 -0700 Subject: skbuff: rewrite the doc for data-only skbs The comment about shinfo->dataref split is really unhelpful, at least to me. Rewrite it and render it to skb documentation. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69802624276d..0e492f9bf532 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -727,16 +727,32 @@ struct skb_shared_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -/* We divide dataref into two halves. The higher 16 bits hold references - * to the payload part of skb->data. The lower 16 bits hold references to - * the entire skb->data. A clone of a headerless skb holds the length of - * the header in skb->hdr_len. - * - * All users must obey the rule that the skb->data reference count must be - * greater than or equal to the payload reference count. - * - * Holding a reference to the payload part means that the user does not - * care about modifications to the header part of skb->data. +/** + * DOC: dataref and headerless skbs + * + * Transport layers send out clones of payload skbs they hold for + * retransmissions. To allow lower layers of the stack to prepend their headers + * we split &skb_shared_info.dataref into two halves. + * The lower 16 bits count the overall number of references. + * The higher 16 bits indicate how many of the references are payload-only. + * skb_header_cloned() checks if skb is allowed to add / write the headers. + * + * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr + * (via __skb_header_release()). Any clone created from marked skb will get + * &sk_buff.hdr_len populated with the available headroom. + * If there's the only clone in existence it's able to modify the headroom + * at will. The sequence of calls inside the transport layer is:: + * + * + * skb_reserve() + * __skb_header_release() + * skb_clone() + * // send the clone down the stack + * + * This is not a very generic construct and it depends on the transport layers + * doing the right thing. In practice there's usually only one payload-only skb. + * Having multiple payload-only skbs with different lengths of hdr_len is not + * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) @@ -2027,8 +2043,10 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) } /** - * __skb_header_release - release reference to header - * @skb: buffer to operate on + * __skb_header_release() - allow clones to use the headroom + * @skb: buffer to operate on + * + * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { -- cgit v1.2.3 From 9facd94114b59afebd405e74b3bc2bf184efe986 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 09:04:56 -0700 Subject: skbuff: render the checksum comment to documentation Long time ago Tom added a giant comment to skbuff.h explaining checksums. Now that we have a place in Documentation for skbuff docs we should render it. Sprinkle some markup while at it. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 219 ++++++++++++++++++++++++++++--------------------- 1 file changed, 124 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e492f9bf532..5296b2c37f62 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -43,98 +43,112 @@ #include #endif -/* The interface for checksum offload between the stack and networking drivers +/** + * DOC: skb checksums + * + * The interface for checksum offload between the stack and networking drivers * is as follows... * - * A. IP checksum related features + * IP checksum related features + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * - * The checksum related features are: - * - * NETIF_F_HW_CSUM - The driver (or its device) is able to compute one - * IP (one's complement) checksum for any combination - * of protocols or protocol layering. The checksum is - * computed and set in a packet per the CHECKSUM_PARTIAL - * interface (see below). - * - * NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain - * TCP or UDP packets over IPv4. These are specifically - * unencapsulated packets of the form IPv4|TCP or - * IPv4|UDP where the Protocol field in the IPv4 header - * is TCP or UDP. The IPv4 header may contain IP options. - * This feature cannot be set in features for a device - * with NETIF_F_HW_CSUM also set. This feature is being - * DEPRECATED (see below). - * - * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain - * TCP or UDP packets over IPv6. These are specifically - * unencapsulated packets of the form IPv6|TCP or - * IPv6|UDP where the Next Header field in the IPv6 - * header is either TCP or UDP. IPv6 extension headers - * are not supported with this feature. This feature - * cannot be set in features for a device with - * NETIF_F_HW_CSUM also set. This feature is being - * DEPRECATED (see below). - * - * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. - * This flag is only used to disable the RX checksum - * feature for a device. The stack will accept receive - * checksum indication in packets received on a device - * regardless of whether NETIF_F_RXCSUM is set. - * - * B. Checksumming of received packets by device. Indication of checksum - * verification is set in skb->ip_summed. Possible values are: - * - * CHECKSUM_NONE: + * .. flat-table:: Checksum related device features + * :widths: 1 10 + * + * * - %NETIF_F_HW_CSUM + * - The driver (or its device) is able to compute one + * IP (one's complement) checksum for any combination + * of protocols or protocol layering. The checksum is + * computed and set in a packet per the CHECKSUM_PARTIAL + * interface (see below). + * + * * - %NETIF_F_IP_CSUM + * - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv4. These are specifically + * unencapsulated packets of the form IPv4|TCP or + * IPv4|UDP where the Protocol field in the IPv4 header + * is TCP or UDP. The IPv4 header may contain IP options. + * This feature cannot be set in features for a device + * with NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * * - %NETIF_F_IPV6_CSUM + * - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv6. These are specifically + * unencapsulated packets of the form IPv6|TCP or + * IPv6|UDP where the Next Header field in the IPv6 + * header is either TCP or UDP. IPv6 extension headers + * are not supported with this feature. This feature + * cannot be set in features for a device with + * NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * * - %NETIF_F_RXCSUM + * - Driver (device) performs receive checksum offload. + * This flag is only used to disable the RX checksum + * feature for a device. The stack will accept receive + * checksum indication in packets received on a device + * regardless of whether NETIF_F_RXCSUM is set. + * + * Checksumming of received packets by device + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Indication of checksum verification is set in &sk_buff.ip_summed. + * Possible values are: + * + * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * - * CHECKSUM_UNNECESSARY: + * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum - * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums - * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY - * if their checksums are okay. skb->csum is still undefined in this case + * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums + * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY + * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * - * CHECKSUM_UNNECESSARY is applicable to following protocols: - * TCP: IPv6 and IPv4. - * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a + * %CHECKSUM_UNNECESSARY is applicable to following protocols: + * + * - TCP: IPv6 and IPv4. + * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. - * GRE: only if the checksum is present in the header. - * SCTP: indicates the CRC in SCTP header has been validated. - * FCOE: indicates the CRC in FC frame has been validated. + * - GRE: only if the checksum is present in the header. + * - SCTP: indicates the CRC in SCTP header has been validated. + * - FCOE: indicates the CRC in FC frame has been validated. * - * skb->csum_level indicates the number of consecutive checksums found in - * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. + * &sk_buff.csum_level indicates the number of consecutive checksums found in + * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), - * GRE (checksum flag is set) and TCP, skb->csum_level would be set to + * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * - * CHECKSUM_COMPLETE: + * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ - * packet as seen by netif_rx() and fills in skb->csum. This means the + * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: + * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * - * CHECKSUM_PARTIAL: + * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet @@ -146,14 +160,18 @@ * packet that are after the checksum being offloaded are not considered to * be verified. * - * C. Checksumming on transmit for non-GSO. The stack requests checksum offload - * in the skb->ip_summed for a packet. Values are: + * Checksumming on transmit for non-GSO + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. + * Values are: * - * CHECKSUM_PARTIAL: + * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() - * from skb->csum_start up to the end, and to record/write the checksum at - * offset skb->csum_start + skb->csum_offset. A driver may verify that the + * from &sk_buff.csum_start up to the end, and to record/write the checksum at + * offset &sk_buff.csum_start + &sk_buff.csum_offset. + * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the @@ -165,55 +183,66 @@ * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * - * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of - * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate + * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of + * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. - * skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based + * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match - * them, skb_checksum_help or skb_crc32c_help (depending on the value of - * csum_not_inet, see item D.) is called to resolve the checksum. + * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of + * &sk_buff.csum_not_inet, see :ref:`crc`) + * is called to resolve the checksum. * - * CHECKSUM_NONE: + * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * - * CHECKSUM_UNNECESSARY: + * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * - * CHECKSUM_COMPLETE: + * - %CHECKSUM_COMPLETE + * * Not used in checksum output. If a driver observes a packet with this value - * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set. - * - * D. Non-IP checksum (CRC) offloads - * - * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of - * offloading the SCTP CRC in a packet. To perform this offload the stack - * will set csum_start and csum_offset accordingly, set ip_summed to - * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in - * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c. - * A driver that supports both IP checksum offload and SCTP CRC32c offload - * must verify which offload is configured for a packet by testing the - * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve - * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. - * - * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of - * offloading the FCOE CRC in a packet. To perform this offload the stack - * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset - * accordingly. Note that there is no indication in the skbuff that the - * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports - * both IP checksum offload and FCOE CRC offload must verify which offload - * is configured for a packet, presumably by inspecting packet headers. - * - * E. Checksumming on output with GSO. - * - * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload + * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. + * + * .. _crc: + * + * Non-IP checksum (CRC) offloads + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * .. flat-table:: + * :widths: 1 10 + * + * * - %NETIF_F_SCTP_CRC + * - This feature indicates that a device is capable of + * offloading the SCTP CRC in a packet. To perform this offload the stack + * will set csum_start and csum_offset accordingly, set ip_summed to + * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication + * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. + * A driver that supports both IP checksum offload and SCTP CRC32c offload + * must verify which offload is configured for a packet by testing the + * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to + * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. + * + * * - %NETIF_F_FCOE_CRC + * - This feature indicates that a device is capable of offloading the FCOE + * CRC in a packet. To perform this offload the stack will set ip_summed + * to %CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note that there is no indication in the skbuff that the + * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports + * both IP checksum offload and FCOE CRC offload must verify which offload + * is configured for a packet, presumably by inspecting packet headers. + * + * Checksumming on output with GSO + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the - * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as + * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded - * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and + * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ -- cgit v1.2.3 From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:19 -0700 Subject: bpf, x86: Generate trampolines from bpf_tramp_links Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect struct bpf_tramp_link(s) for a trampoline. struct bpf_tramp_link extends bpf_link to act as a linked list node. arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to collects all bpf_tramp_link(s) that a trampoline should call. Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links instead of bpf_tramp_progs. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com --- include/linux/bpf.h | 36 ++++++++++++++++++++++++------------ include/linux/bpf_types.h | 1 + 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 551b7198ae8a..75e0110a65e1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -723,11 +723,11 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 38 +#define BPF_MAX_TRAMP_LINKS 38 -struct bpf_tramp_progs { - struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; - int nr_progs; +struct bpf_tramp_links { + struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; + int nr_links; }; /* Different use cases for BPF trampoline: @@ -753,7 +753,7 @@ struct bpf_tramp_progs { struct bpf_tramp_image; int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); @@ -852,9 +852,10 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( { return bpf_func(ctx, insnsi); } + #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -905,12 +906,12 @@ int bpf_jit_charge_modmem(u32 size); void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, +static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { return -ENOTSUPP; @@ -1009,7 +1010,6 @@ struct bpf_prog_aux { bool tail_call_reachable; bool xdp_has_frags; bool use_bpf_prog_pack; - struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -1096,6 +1096,18 @@ struct bpf_link_ops { struct bpf_link_info *info); }; +struct bpf_tramp_link { + struct bpf_link link; + struct hlist_node tramp_hlist; +}; + +struct bpf_tracing_link { + struct bpf_tramp_link link; + enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; +}; + struct bpf_link_primer { struct bpf_link *link; struct file *file; @@ -1133,8 +1145,8 @@ bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3e24ad0c4b3c..2b9112b80171 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -141,3 +141,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) +BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) -- cgit v1.2.3 From e384c7b7b46d0a5f4bf3c554f963e6e9622d0ab1 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:20 -0700 Subject: bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on stacks and set/reset the current bpf_run_ctx before/after calling a bpf_prog. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com --- include/linux/bpf.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75e0110a65e1..256fb802e580 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -730,6 +730,8 @@ struct bpf_tramp_links { int nr_links; }; +struct bpf_tramp_run_ctx; + /* Different use cases for BPF trampoline: * 1. replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS @@ -756,10 +758,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i struct bpf_tramp_links *tlinks, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); @@ -1351,6 +1354,12 @@ struct bpf_trace_run_ctx { u64 bpf_cookie; }; +struct bpf_tramp_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; + struct bpf_run_ctx *saved_run_ctx; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; -- cgit v1.2.3 From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Tue, 10 May 2022 13:59:21 -0700 Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm. Pass a cookie along with BPF_LINK_CREATE requests. Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie. The cookie of a bpf_tracing_link is available by calling bpf_get_attach_cookie when running the BPF program of the attached link. The value of a cookie will be set at bpf_tramp_run_ctx by the trampoline of the link. Signed-off-by: Kui-Feng Lee Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 256fb802e580..aba7ded56436 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,7 @@ struct bpf_link_ops { struct bpf_tramp_link { struct bpf_link link; struct hlist_node tramp_hlist; + u64 cookie; }; struct bpf_tracing_link { -- cgit v1.2.3 From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:37 -0700 Subject: net: add include/net/net_debug.h Remove from include/linux/netdevice.h helpers that send debug/info/warnings to syslog. We plan adding more helpers in following patches. v2: added two includes, and 'struct net_device' forward declaration to avoid compile errors (kernel bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 141 +--------------------------------------------- 1 file changed, 1 insertion(+), 140 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51fe143091cf..536321691c72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -50,6 +50,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -5071,81 +5072,9 @@ static inline const char *netdev_reg_state(const struct net_device *dev) return " (unknown)"; } -__printf(3, 4) __cold -void netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); -__printf(2, 3) __cold -void netdev_emerg(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_alert(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_crit(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_err(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_warn(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_notice(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_info(const struct net_device *dev, const char *format, ...); - -#define netdev_level_once(level, dev, fmt, ...) \ -do { \ - static bool __section(".data.once") __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ - } \ -} while (0) - -#define netdev_emerg_once(dev, fmt, ...) \ - netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) -#define netdev_alert_once(dev, fmt, ...) \ - netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) -#define netdev_crit_once(dev, fmt, ...) \ - netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) -#define netdev_err_once(dev, fmt, ...) \ - netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) -#define netdev_warn_once(dev, fmt, ...) \ - netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) -#define netdev_notice_once(dev, fmt, ...) \ - netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) -#define netdev_info_once(dev, fmt, ...) \ - netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) - #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netdev_dbg(__dev, format, args...) \ -do { \ - dynamic_netdev_dbg(__dev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netdev_dbg(__dev, format, args...) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args) -#else -#define netdev_dbg(__dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args); \ -}) -#endif - -#if defined(VERBOSE_DEBUG) -#define netdev_vdbg netdev_dbg -#else - -#define netdev_vdbg(dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the @@ -5159,74 +5088,6 @@ do { \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) -/* netif printk helpers, similar to netdev_printk */ - -#define netif_printk(priv, type, level, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_printk(level, (dev), fmt, ##args); \ -} while (0) - -#define netif_level(level, priv, type, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_##level(dev, fmt, ##args); \ -} while (0) - -#define netif_emerg(priv, type, dev, fmt, args...) \ - netif_level(emerg, priv, type, dev, fmt, ##args) -#define netif_alert(priv, type, dev, fmt, args...) \ - netif_level(alert, priv, type, dev, fmt, ##args) -#define netif_crit(priv, type, dev, fmt, args...) \ - netif_level(crit, priv, type, dev, fmt, ##args) -#define netif_err(priv, type, dev, fmt, args...) \ - netif_level(err, priv, type, dev, fmt, ##args) -#define netif_warn(priv, type, dev, fmt, args...) \ - netif_level(warn, priv, type, dev, fmt, ##args) -#define netif_notice(priv, type, dev, fmt, args...) \ - netif_level(notice, priv, type, dev, fmt, ##args) -#define netif_info(priv, type, dev, fmt, args...) \ - netif_level(info, priv, type, dev, fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netif_dbg(priv, type, netdev, format, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - dynamic_netdev_dbg(netdev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netif_dbg(priv, type, dev, format, args...) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) -#else -#define netif_dbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - -/* if @cond then downgrade to debug, else print at @level */ -#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ - do { \ - if (cond) \ - netif_dbg(priv, type, netdev, fmt, ##args); \ - else \ - netif_ ## level(priv, type, netdev, fmt, ##args); \ - } while (0) - -#if defined(VERBOSE_DEBUG) -#define netif_vdbg netif_dbg -#else -#define netif_vdbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. -- cgit v1.2.3 From 66e4c8d950083df8e12981babca788e1635c92b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:39 -0700 Subject: net: warn if transport header was not set Make sure skb_transport_header() and skb_transport_offset() uses are not fooled if the transport header has not been set. This change will likely expose existing bugs in linux networking stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5296b2c37f62..b91d225fdc13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -42,6 +42,7 @@ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif +#include /** * DOC: skb checksums @@ -2904,6 +2905,7 @@ static inline bool skb_transport_header_was_set(const struct sk_buff *skb) static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { + DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } -- cgit v1.2.3 From ee692a21e9bf8354bd3ec816f1cf4bff8619ed77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 May 2022 11:17:45 +0530 Subject: fs,io_uring: add infrastructure for uring-cmd file_operations->uring_cmd is a file private handler. This is somewhat similar to ioctl but hopefully a lot more sane and useful as it can be used to enable many io_uring capabilities for the underlying operation. IORING_OP_URING_CMD is a file private kind of request. io_uring doesn't know what is in this command type, it's for the provider of ->uring_cmd() to deal with. Co-developed-by: Kanchan Joshi Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511054750.20432-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 ++ include/linux/io_uring.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..87b5af1d9fbe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1953,6 +1953,7 @@ struct dir_context { #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) struct iov_iter; +struct io_uring_cmd; struct file_operations { struct module *owner; @@ -1995,6 +1996,7 @@ struct file_operations { struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); + int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); } __randomize_layout; struct inode_operations { diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 24651c229ed2..4a2f6cc5a492 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,7 +5,32 @@ #include #include +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = 4, + IO_URING_F_CQE32 = 8, + IO_URING_F_IOPOLL = 16, +}; + +struct io_uring_cmd { + struct file *file; + const void *cmd; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd); + u32 cmd_op; + u32 pad; + u8 pdu[32]; /* available inline for free use */ +}; + #if defined(CONFIG_IO_URING) +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; -- cgit v1.2.3 From deaf7c4b4bf8b802cc465bb9b33fe6c76e812924 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 May 2022 21:03:43 +0200 Subject: lockdep: Delete local_irq_enable_in_hardirq() No more users and there is no desire to grow new ones. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/8735hir0j4.ffs@tglx --- include/linux/interrupt.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f40754caaefa..b5e06a6e4019 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -222,24 +222,6 @@ devm_request_any_context_irq(struct device *dev, unsigned int irq, extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); -/* - * On lockdep we dont want to enable hardirqs in hardirq - * context. Use local_irq_enable_in_hardirq() to annotate - * kernel code that has to do this nevertheless (pretty much - * the only valid case is for old/broken hardware that is - * insanely slow). - * - * NOTE: in theory this might break fragile code that relies - * on hardirq delivery - in practice we dont seem to have such - * places left. So the only effect should be slightly increased - * irqs-off latencies. - */ -#ifdef CONFIG_LOCKDEP -# define local_irq_enable_in_hardirq() do { } while (0) -#else -# define local_irq_enable_in_hardirq() local_irq_enable() -#endif - bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); -- cgit v1.2.3 From abcebcd39fe094b68826cc04f2eca835606697f9 Mon Sep 17 00:00:00 2001 From: Michael Shych Date: Sat, 30 Apr 2022 14:58:07 +0300 Subject: platform_data/mlxreg: Add field for notification callback Add notification callback to inform caller that platform driver probing has been completed. It allows to caller to perform some initialization flow steps depending on specific driver probing completion. Signed-off-by: Michael Shych Reviewed-by: Vadim Pasternak Link: https://lore.kernel.org/r/20220430115809.54565-2-michaelsh@nvidia.com Signed-off-by: Hans de Goede --- include/linux/platform_data/mlxreg.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 40185f9d7c14..a6bd74e29b6b 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -216,6 +216,8 @@ struct mlxreg_core_platform_data { * @mask_low: low aggregation interrupt common mask; * @deferred_nr: I2C adapter number must be exist prior probing execution; * @shift_nr: I2C adapter numbers must be incremented by this value; + * @handle: handle to be passed by callback; + * @completion_notify: callback to notify when platform driver probing is done; */ struct mlxreg_core_hotplug_platform_data { struct mlxreg_core_item *items; @@ -228,6 +230,8 @@ struct mlxreg_core_hotplug_platform_data { u32 mask_low; int deferred_nr; int shift_nr; + void *handle; + int (*completion_notify)(void *handle, int id); }; #endif /* __LINUX_PLATFORM_DATA_MLXREG_H */ -- cgit v1.2.3 From f9d76d15072caf1ec5558fa7cc6d93c7b9d33488 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 10 May 2022 11:51:29 -0400 Subject: USB: gadget: Add ID numbers to gadget names Putting USB gadgets on a new bus of their own encounters a problem when multiple gadgets are present: They all have the same name! The driver core fails with a "sys: cannot create duplicate filename" error when creating any of the /sys/bus/gadget/devices/ symbolic links after the first. This patch fixes the problem by adding a ".N" suffix to each gadget's name when the gadget is registered (where N is a unique ID number), thus making the names distinct. Reported-and-tested-by: Geert Uytterhoeven Signed-off-by: Alan Stern Reviewed-by: Geert Uytterhoeven Fixes: fc274c1e9973 ("USB: gadget: Add a new bus for gadgets") Link: https://lore.kernel.org/r/YnqKAXKyp9Vq/pbn@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index cf7af8a0a6e9..3ad58b7a0824 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -386,6 +386,7 @@ struct usb_gadget_ops { * @lpm_capable: If the gadget max_speed is FULL or HIGH, this flag * indicates that it supports LPM as per the LPM ECN & errata. * @irq: the interrupt number for device controller. + * @id_number: a unique ID number for ensuring that gadget names are distinct * * Gadgets have a mostly-portable "gadget driver" implementing device * functions, handling all usb configurations and interfaces. Gadget @@ -446,6 +447,7 @@ struct usb_gadget { unsigned connected:1; unsigned lpm_capable:1; int irq; + int id_number; }; #define work_to_gadget(w) (container_of((w), struct usb_gadget, work)) -- cgit v1.2.3 From a6b94c6b49198266eaf78095a632df7245ef5196 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 2 May 2022 09:36:28 -0700 Subject: Drivers: hv: vmbus: Remove support for Hyper-V 2008 and Hyper-V 2008R2/Win7 The VMbus driver has special case code for running on the first released versions of Hyper-V: 2008 and 2008 R2/Windows 7. These versions are now out of support (except for extended security updates) and lack the performance features needed for effective production usage of Linux guests. Simplify the code by removing the negotiation of the VMbus protocol versions required for these releases of Hyper-V, and by removing the special case code for handling these VMbus protocol versions. Signed-off-by: Michael Kelley Reviewed-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/1651509391-2058-2-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index c440c45887c2..a2464295c14a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -230,15 +230,19 @@ static inline u32 hv_get_avail_to_write_percent( * two 16 bit quantities: major_number. minor_number. * * 0 . 13 (Windows Server 2008) - * 1 . 1 (Windows 7) - * 2 . 4 (Windows 8) - * 3 . 0 (Windows 8 R2) + * 1 . 1 (Windows 7, WS2008 R2) + * 2 . 4 (Windows 8, WS2012) + * 3 . 0 (Windows 8.1, WS2012 R2) * 4 . 0 (Windows 10) * 4 . 1 (Windows 10 RS3) * 5 . 0 (Newer Windows 10) * 5 . 1 (Windows 10 RS4) * 5 . 2 (Windows Server 2019, RS5) * 5 . 3 (Windows Server 2022) + * + * The WS2008 and WIN7 versions are listed here for + * completeness but are no longer supported in the + * Linux kernel. */ #define VERSION_WS2008 ((0 << 16) | (13)) -- cgit v1.2.3 From 09ea48efffa3156218980e20aaf23dcc7d6000fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:58 -0600 Subject: vfio: Make vfio_(un)register_notifier accept a vfio_device All callers have a struct vfio_device trivially available, pass it in directly and avoid calling the expensive vfio_group_get_from_dev(). Acked-by: Eric Farman Reviewed-by: Jason J. Herne Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 66dda06ec42d..a00fd722f044 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -178,11 +178,11 @@ enum vfio_notify_type { /* events for VFIO_GROUP_NOTIFY */ #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) -extern int vfio_register_notifier(struct device *dev, +extern int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *required_events, struct notifier_block *nb); -extern int vfio_unregister_notifier(struct device *dev, +extern int vfio_unregister_notifier(struct vfio_device *device, enum vfio_notify_type type, struct notifier_block *nb); -- cgit v1.2.3 From 8e432bb015b6c327d016b1dca509964e189c4770 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: vfio/mdev: Pass in a struct vfio_device * to vfio_pin/unpin_pages() Every caller has a readily available vfio_device pointer, use that instead of passing in a generic struct device. The struct vfio_device already contains the group we need so this avoids complexity, extra refcountings, and a confusing lifecycle model. Reviewed-by: Christoph Hellwig Acked-by: Eric Farman Reviewed-by: Jason J. Herne Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a00fd722f044..bddc70f88899 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -150,9 +150,9 @@ extern long vfio_external_check_extension(struct vfio_group *group, #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) -extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); -extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); extern int vfio_group_pin_pages(struct vfio_group *group, -- cgit v1.2.3 From c6250ffbacc5989a5db3b9acce34b93570938f60 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: vfio/mdev: Pass in a struct vfio_device * to vfio_dma_rw() Every caller has a readily available vfio_device pointer, use that instead of passing in a generic struct device. Change vfio_dma_rw() to take in the struct vfio_device and move the container users that would have been held by vfio_group_get_external_user_from_dev() to vfio_dma_rw() directly, like vfio_pin/unpin_pages(). Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index bddc70f88899..8a1510258717 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -161,7 +161,7 @@ extern int vfio_group_pin_pages(struct vfio_group *group, extern int vfio_group_unpin_pages(struct vfio_group *group, unsigned long *user_iova_pfn, int npage); -extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, +extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, size_t len, bool write); extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); -- cgit v1.2.3 From 231657b345046552b35670b45b892cfce2610e12 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:13:00 -0600 Subject: vfio: Remove dead code Now that callers have been updated to use the vfio_device APIs the driver facing group interface is no longer used, delete it: - vfio_group_get_external_user_from_dev() - vfio_group_pin_pages() - vfio_group_unpin_pages() - vfio_group_iommu_domain() -- Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8a1510258717..6195edd2edcd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -140,8 +140,6 @@ int vfio_mig_get_next_state(struct vfio_device *device, */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); -extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device - *dev); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); extern int vfio_external_user_iommu_id(struct vfio_group *group); @@ -154,18 +152,9 @@ extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); - -extern int vfio_group_pin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage, - int prot, unsigned long *phys_pfn); -extern int vfio_group_unpin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage); - extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, size_t len, bool write); -extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); - /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, -- cgit v1.2.3 From ff806cbd90bd2cc3d08a026e26157e5b5a6b5e03 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:19:07 -0600 Subject: vfio/pci: Remove vfio_device_get_from_dev() The last user of this function is in PCI callbacks that want to convert their struct pci_dev to a vfio_device. Instead of searching use the vfio_device available trivially through the drvdata. When a callback in the device_driver is called, the caller must hold the device_lock() on dev. The purpose of the device_lock is to prevent remove() from being called (see __device_release_driver), and allow the driver to safely interact with its drvdata without races. The PCI core correctly follows this and holds the device_lock() when calling error_detected (see report_error_detected) and sriov_configure (see sriov_numvfs_store). Further, since the drvdata holds a positive refcount on the vfio_device any access of the drvdata, under the device_lock(), from a driver callback needs no further protection or refcounting. Thus the remark in the vfio_device_get_from_dev() comment does not apply here, VFIO PCI drivers all call vfio_unregister_group_dev() from their remove callbacks under the device_lock() and cannot race with the remaining callers. Reviewed-by: Kevin Tian Reviewed-by: Shameer Kolothum Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-c841817a0349+8f-vfio_get_from_dev_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- include/linux/vfio_pci_core.h | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6195edd2edcd..be1e7db4a314 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -125,8 +125,6 @@ void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); -extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 48f2dd3c568c..23c176d4b073 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -227,8 +227,9 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); extern const struct pci_error_handlers vfio_pci_core_err_handlers; +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, -- cgit v1.2.3 From 157cc18122b4a1456d19048e151a164216c4a704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:48:54 -0500 Subject: signal: Rename send_signal send_signal_locked Rename send_signal and __send_signal to send_signal_locked and __send_signal_locked to make send_signal usable outside of signal.c. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index a6db6f2ae113..55605bdf5ce9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -283,6 +283,8 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); +extern int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); -- cgit v1.2.3 From e71ba124078e391879e0bf111529fa2d630d106c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Apr 2022 09:28:50 -0500 Subject: signal: Replace __group_send_sig_info with send_signal_locked The function __group_send_sig_info is just a light wrapper around send_signal_locked with one parameter fixed to a constant value. As the wrapper adds no real value update the code to directly call the wrapped function. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 55605bdf5ce9..3b98e7a28538 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,7 +282,6 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); -extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int sigprocmask(int, sigset_t *, sigset_t *); -- cgit v1.2.3 From c200e4bb44e80b343c09841e7caaaca0aac5e5fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Apr 2022 16:30:17 -0500 Subject: ptrace/um: Replace PT_DTRACE with TIF_SINGLESTEP User mode linux is the last user of the PT_DTRACE flag. Using the flag to indicate single stepping is a little confusing and worse changing tsk->ptrace without locking could potentionally cause problems. So use a thread info flag with a better name instead of flag in tsk->ptrace. Remove the definition PT_DTRACE as uml is the last user. Cc: stable@vger.kernel.org Acked-by: Johannes Berg Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 15b3d176b6b4..4c06f9f8ef3f 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -30,7 +30,6 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 -#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ -- cgit v1.2.3 From 4a3d2717d140401df7501a95e454180831a0c5af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Apr 2022 16:45:37 -0500 Subject: ptrace/xtensa: Replace PT_SINGLESTEP with TIF_SINGLESTEP xtensa is the last user of the PT_SINGLESTEP flag. Changing tsk->ptrace in user_enable_single_step and user_disable_single_step without locking could potentiallly cause problems. So use a thread info flag instead of a flag in tsk->ptrace. Use TIF_SINGLESTEP that xtensa already had defined but unused. Remove the definitions of PT_SINGLESTEP and PT_BLOCKSTEP as they have no more users. Cc: stable@vger.kernel.org Acked-by: Max Filippov Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-4-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 4c06f9f8ef3f..c952c5ba8fab 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -46,12 +46,6 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) -/* single stepping state bits (used on ARM and PA-RISC) */ -#define PT_SINGLESTEP_BIT 31 -#define PT_SINGLESTEP (1< Date: Fri, 29 Apr 2022 08:43:34 -0500 Subject: ptrace: Don't change __state Stop playing with tsk->__state to remove TASK_WAKEKILL while a ptrace command is executing. Instead remove TASK_WAKEKILL from the definition of TASK_TRACED, and implement a new jobctl flag TASK_PTRACE_FROZEN. This new flag is set in jobctl_freeze_task and cleared when ptrace_stop is awoken or in jobctl_unfreeze_task (when ptrace_stop remains asleep). In signal_wake_up add __TASK_TRACED to state along with TASK_WAKEKILL when the wake up is for a fatal signal. Skip adding __TASK_TRACED when TASK_PTRACE_FROZEN is not set. This has the same effect as changing TASK_TRACED to __TASK_TRACED as all of the wake_ups that use TASK_KILLABLE go through signal_wake_up. Handle a ptrace_stop being called with a pending fatal signal. Previously it would have been handled by schedule simply failing to sleep. As TASK_WAKEKILL is no longer part of TASK_TRACED schedule will sleep with a fatal_signal_pending. The code in signal_wake_up guarantees that the code will be awaked by any fatal signal that codes after TASK_TRACED is set. Previously the __state value of __TASK_TRACED was changed to TASK_RUNNING when woken up or back to TASK_TRACED when the code was left in ptrace_stop. Now when woken up ptrace_stop now clears JOBCTL_PTRACE_FROZEN and when left sleeping ptrace_unfreezed_traced clears JOBCTL_PTRACE_FROZEN. Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-10-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 2 +- include/linux/sched/jobctl.h | 2 ++ include/linux/sched/signal.h | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..610f2fdb1e2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -103,7 +103,7 @@ struct task_group; /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index fa067de9f1a9..d556c3425963 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,6 +19,7 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ +#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -28,6 +29,7 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) +#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3c8b34876744..e66948abbee4 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -435,9 +435,10 @@ extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); -static inline void signal_wake_up(struct task_struct *t, bool resume) +static inline void signal_wake_up(struct task_struct *t, bool fatal) { - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); + fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); + signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { -- cgit v1.2.3 From 31cae1eaae4fd65095ad6a3659db467bc3c2599e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 May 2022 15:57:47 -0500 Subject: sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state Currently ptrace_stop() / do_signal_stop() rely on the special states TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this state exists only in task->__state and nowhere else. There's two spots of bother with this: - PREEMPT_RT has task->saved_state which complicates matters, meaning task_is_{traced,stopped}() needs to check an additional variable. - An alternative freezer implementation that itself relies on a special TASK state would loose TASK_TRACED/TASK_STOPPED and will result in misbehaviour. As such, add additional state to task->jobctl to track this state outside of task->__state. NOTE: this doesn't actually fix anything yet, just adds extra state. --EWB * didn't add a unnecessary newline in signal.h * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up instead of in signal_wake_up_state. This prevents the clearing of TASK_STOPPED and TASK_TRACED from getting lost. * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220421150654.757693825@infradead.org Tested-by: Kees Cook Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20220505182645.497868-12-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- include/linux/sched.h | 8 +++----- include/linux/sched/jobctl.h | 6 ++++++ include/linux/sched/signal.h | 19 +++++++++++++++---- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 610f2fdb1e2c..cbe5c899599c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,11 +118,9 @@ struct task_group; #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) - -#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) - -#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) +#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) +#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d556c3425963..68876d0a7ef9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -21,6 +21,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ +#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */ +#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */ + #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) #define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) @@ -31,6 +34,9 @@ struct task_struct; #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) +#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT) +#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT) + #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e66948abbee4..07ba3404fcde 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -294,8 +294,10 @@ static inline int kernel_dequeue_signal(void) static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); - if (current->jobctl & JOBCTL_STOP_DEQUEUED) + if (current->jobctl & JOBCTL_STOP_DEQUEUED) { + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); + } spin_unlock_irq(¤t->sighand->siglock); schedule(); @@ -437,12 +439,21 @@ extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { - fatal = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN); - signal_wake_up_state(t, fatal ? TASK_WAKEKILL | __TASK_TRACED : 0); + unsigned int state = 0; + if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { + t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); + state = TASK_WAKEKILL | __TASK_TRACED; + } + signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { - signal_wake_up_state(t, resume ? __TASK_TRACED : 0); + unsigned int state = 0; + if (resume) { + t->jobctl &= ~JOBCTL_TRACED; + state = __TASK_TRACED; + } + signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); -- cgit v1.2.3 From 5b74c690e1c55953ec99fd9dab74f72dbee4fe95 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 12 May 2022 01:16:51 +0530 Subject: bpf: Fix sparse warning for bpf_kptr_xchg_proto Kernel Test Robot complained about missing static storage class annotation for bpf_kptr_xchg_proto variable. sparse: symbol 'bpf_kptr_xchg_proto' was not declared. Should it be static? This caused by missing extern definition in the header. Add it to suppress the sparse warning. Fixes: c0a5a21c25f3 ("bpf: Allow storing referenced kptr in map") Reported-by: kernel test robot Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220511194654.765705-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aba7ded56436..3ded8711457f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2246,6 +2246,7 @@ extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; +extern const struct bpf_func_proto bpf_kptr_xchg_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001 From: Feng Zhou Date: Wed, 11 May 2022 17:38:53 +0800 Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map Add new ebpf helpers bpf_map_lookup_percpu_elem. The implementation method is relatively simple, refer to the implementation method of map_lookup_elem of percpu map, increase the parameters of cpu, and obtain it according to the specified cpu. Signed-off-by: Feng Zhou Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ded8711457f..5061ccd8b2dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -89,6 +89,7 @@ struct bpf_map_ops { int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); int (*map_pop_elem)(struct bpf_map *map, void *value); int (*map_peek_elem)(struct bpf_map *map, void *value); + void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -2184,6 +2185,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_map_push_elem_proto; extern const struct bpf_func_proto bpf_map_pop_elem_proto; extern const struct bpf_func_proto bpf_map_peek_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; -- cgit v1.2.3 From 43213daed6d6cb60e8cf69058a6db8648a556d9d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 May 2022 19:53:01 -0700 Subject: fortify: Provide a memcpy trap door for sharp corners As we continue to narrow the scope of what the FORTIFY memcpy() will accept and build alternative APIs that give the compiler appropriate visibility into more complex memcpy scenarios, there is a need for "unfortified" memcpy use in rare cases where combinations of compiler behaviors, source code layout, etc, result in cases where the stricter memcpy checks need to be bypassed until appropriate solutions can be developed (i.e. fix compiler bugs, code refactoring, new API, etc). The intention is for this to be used only if there's no other reasonable solution, for its use to include a justification that can be used to assess future solutions, and for it to be temporary. Example usage included, based on analysis and discussion from: https://lore.kernel.org/netdev/CANn89iLS_2cshtuXPyNUGDPaic=sJiYfvTb_wNLgWrZRyBxZ_g@mail.gmail.com Cc: Jakub Kicinski Cc: Eric Dumazet Cc: "David S. Miller" Cc: Paolo Abeni Cc: Coco Li Cc: Tariq Toukan Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: netdev@vger.kernel.org Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220511025301.3636666-1-keescook@chromium.org Signed-off-by: Paolo Abeni --- include/linux/fortify-string.h | 16 ++++++++++++++++ include/linux/string.h | 4 ++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 295637a66c46..3b401fa0f374 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -52,6 +52,22 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) #define __underlying_strncpy __builtin_strncpy #endif +/** + * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking + * + * @dst: Destination memory address to write to + * @src: Source memory address to read from + * @bytes: How many bytes to write to @dst from @src + * @justification: Free-form text or comment describing why the use is needed + * + * This should be used for corner cases where the compiler cannot do the + * right thing, or during transitions between APIs, etc. It should be used + * very rarely, and includes a place for justification detailing where bounds + * checking has happened, and why existing solutions cannot be employed. + */ +#define unsafe_memcpy(dst, src, bytes, justification) \ + __underlying_memcpy(dst, src, bytes) + /* * Clang's use of __builtin_object_size() within inlines needs hinting via * __pass_object_size(). The preference is to only ever use type 1 (member diff --git a/include/linux/string.h b/include/linux/string.h index b6572aeca2f5..61ec7e4f6311 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -252,6 +252,10 @@ static inline const char *kbasename(const char *path) #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #include #endif +#ifndef unsafe_memcpy +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) +#endif void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, int pad); -- cgit v1.2.3 From a44623d9279086c89f631201d993aa332f7c9e66 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Tue, 10 May 2022 14:46:29 +0530 Subject: usb: core: hcd: Add support for deferring roothub registration It has been observed with certain PCIe USB cards (like Inateck connected to AM64 EVM or J7200 EVM) that as soon as the primary roothub is registered, port status change is handled even before xHC is running leading to cold plug USB devices not detected. For such cases, registering both the root hubs along with the second HCD is required. Add support for deferring roothub registration in usb_add_hcd(), so that both primary and secondary roothubs are registered along with the second HCD. This patch has been added and reverted earier as it triggered a race in usb device enumeration. That race is now fixed in 5.16-rc3, and in stable back to 5.4 commit 6cca13de26ee ("usb: hub: Fix locking issues with address0_mutex") commit 6ae6dc22d2d1 ("usb: hub: Fix usb enumeration issue due to address0 race") CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20220510091630.16564-2-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 548a028f2dab..2c1fc9212cf2 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,6 +124,7 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ +#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -134,6 +135,7 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) +#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default -- cgit v1.2.3 From 5ce7729f25c16d5045deff4c9577e6d565da2d8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 May 2022 08:14:08 +0200 Subject: block: reorder the REQ_ flags Keep the op-specific flag last so that they are clearly separate from the generic flags. Various recent commits just kept adding new flags at the end. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220512061408.1826595-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4968cb17b13b..30f9a5391fd5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -408,16 +408,17 @@ enum req_flag_bits { * work item to avoid such priority inversions. */ __REQ_CGROUP_PUNT, - - /* command specific flags for REQ_OP_WRITE_ZEROES: */ - __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_POLLED, /* caller polls for completion using bio_poll */ __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ + __REQ_SWAP, /* swap I/O */ + __REQ_DRV, /* for driver use */ + + /* + * Command specific flags, keep last: + */ + /* for REQ_OP_WRITE_ZEROES: */ + __REQ_NOUNMAP, /* do not free blocks when zeroing */ - /* for driver use */ - __REQ_DRV, - __REQ_SWAP, /* swapping request. */ __REQ_NR_BITS, /* stops here */ }; -- cgit v1.2.3 From 5d2ae14276e698c76fa0c8ce870103f343b38263 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 11 May 2022 16:51:52 -0700 Subject: block: Fix the bio.bi_opf comment Commit ef295ecf090d modified the Linux kernel such that the bottom bits of the bi_opf member contain the operation instead of the topmost bits. That commit did not update the comment next to bi_opf. Hence this patch. From commit ef295ecf090d: -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) +#define bio_op(bio) ((bio)->bi_opf & REQ_OP_MASK) Cc: Christoph Hellwig Cc: Ming Lei Fixes: ef295ecf090d ("block: better op and flags encoding") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220511235152.1082246-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 30f9a5391fd5..40e815400611 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -246,9 +246,8 @@ typedef unsigned int blk_qc_t; struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - unsigned int bi_opf; /* bottom bits req flags, - * top bits REQ_OP. Use - * accessors. + unsigned int bi_opf; /* bottom bits REQ_OP, top bits + * req_flags. */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; -- cgit v1.2.3 From 2760f5a415c3b86c6394738c6cff740c8b4ce664 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 May 2022 15:54:01 -0700 Subject: stop_machine: Add stop_core_cpuslocked() for per-core operations Hardware core level testing features require near simultaneous execution of WRMSR instructions on all threads of a core to initiate a test. Provide a customized cut down version of stop_machine_cpuslocked() that just operates on the threads of a single core. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Tony Luck Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220506225410.1652287-4-tony.luck@intel.com Signed-off-by: Hans de Goede --- include/linux/stop_machine.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 46fb3ebdd16e..ea7a74ea7389 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -124,6 +124,22 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); */ int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); +/** + * stop_core_cpuslocked: - stop all threads on just one core + * @cpu: any cpu in the targeted core + * @fn: the function to run + * @data: the data ptr for @fn() + * + * Same as above, but instead of every CPU, only the logical CPUs of a + * single core are affected. + * + * Context: Must be called from within a cpus_read_lock() protected region. + * + * Return: 0 if all executions of @fn returned 0, any non zero return + * value if any returned non zero. + */ +int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data); + int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 75d6fe48a21a0ea1565228c12b9c16f3fb37b673 Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Thu, 12 May 2022 20:06:45 +0530 Subject: spi: Doc fix - Describe add_lock and dma_map_dev in spi_controller This fixes the corresponding warnings during building the docs. Signed-off-by: Siddh Raman Pant Link: https://lore.kernel.org/r/4e6187a4-d0f8-4750-e407-e09cc1c91789@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 5f8c063ddff4..df70eb1a671e 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -347,6 +347,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @max_message_size: function that returns the max message size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @io_mutex: mutex for physical bus access + * @add_lock: mutex to avoid adding devices to the same chipselect * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use @@ -361,6 +362,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @transfer: adds a message to the controller's transfer queue. * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA + * @dma_map_dev: device which can be used for DMA mapping * @queued: whether this controller is providing an internal message queue * @kworker: pointer to thread struct for message pump * @pump_messages: work struct for scheduling work to the message pump -- cgit v1.2.3 From 9824117dd964ecebf5d81990dbf21dfb56445049 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 12 Apr 2022 15:38:51 -0500 Subject: ipmi: Add an intializer for ipmi_smi_msg struct There was a "type" element added to this structure, but some static values were missed. The default value will be zero, which is correct, but create an initializer for the type and initialize the type properly in the initializer to avoid future issues. Reported-by: Joe Wiese Signed-off-by: Corey Minyard --- include/linux/ipmi_smi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 9277d21c2690..5d69820d8b02 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -125,6 +125,12 @@ struct ipmi_smi_msg { void (*done)(struct ipmi_smi_msg *msg); }; +#define INIT_IPMI_SMI_MSG(done_handler) \ +{ \ + .done = done_handler, \ + .type = IPMI_SMI_MSG_TYPE_NORMAL \ +} + struct ipmi_smi_handlers { struct module *owner; -- cgit v1.2.3 From f214549d717310f795c20db9497db3938116399d Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 12 Apr 2022 15:49:47 -0500 Subject: ipmi: Add an intializer for ipmi_recv_msg struct Don't hand-initialize the struct here, create a macro to initialize it so new fields added don't get forgotten in places. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 163831a087ef..a1c9c0d48ebf 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -72,6 +72,11 @@ struct ipmi_recv_msg { unsigned char msg_data[IPMI_MAX_MSG_LENGTH]; }; +#define INIT_IPMI_RECV_MSG(done_handler) \ +{ \ + .done = done_handler \ +} + /* Allocate and free the receive message. */ void ipmi_free_recv_msg(struct ipmi_recv_msg *msg); -- cgit v1.2.3 From 80140a81f7f833998d732102eea0fea230b88067 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Apr 2022 11:03:21 +0200 Subject: module.h: simplify MODULE_IMPORT_NS In commit ca321ec74322 ("module.h: allow #define strings to work with MODULE_IMPORT_NS") I fixed up the MODULE_IMPORT_NS() macro to allow defined strings to work with it. Unfortunatly I did it in a two-stage process, when it could just be done with the __stringify() macro as pointed out by Masahiro Yamada. Clean this up to only be one macro instead of two steps to achieve the same end result. Fixes: ca321ec74322 ("module.h: allow #define strings to work with MODULE_IMPORT_NS") Reported-by: Masahiro Yamada Cc: Luis Chamberlain Cc: Jessica Yu Cc: Matthias Maennich Signed-off-by: Greg Kroah-Hartman Signed-off-by: Luis Chamberlain --- include/linux/module.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 46d4d5f2516e..abd9fa916b7d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -290,8 +290,7 @@ extern typeof(name) __mod_##type##__##name##_device_table \ * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) -#define _MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, #ns) -#define MODULE_IMPORT_NS(ns) _MODULE_IMPORT_NS(ns) +#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns)) struct notifier_block; -- cgit v1.2.3 From 0df65743537dd6e1c8b0924714f14dc367cba2be Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 May 2022 10:23:05 -0700 Subject: skbuff: replace a BUG_ON() with the new DEBUG_NET_WARN_ON_ONCE() Very few drivers actually have Kconfig knobs for adding -DDEBUG. 8 according to a quick grep, while there are 93 users of skb_checksum_none_assert(). Switch to the new DEBUG_NET_WARN_ON_ONCE() to catch bad skbs. Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220511172305.1382810-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b91d225fdc13..9d82a8b6c8f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5048,9 +5048,7 @@ static inline void skb_forward_csum(struct sk_buff *skb) */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { -#ifdef DEBUG - BUG_ON(skb->ip_summed != CHECKSUM_NONE); -#endif + DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); -- cgit v1.2.3 From 58e4a2d27d3255e4e8c507fdc13734dccc9fc4c7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Dec 2021 09:28:46 +0300 Subject: extcon: Fix extcon_get_extcon_dev() error handling The extcon_get_extcon_dev() function returns error pointers on error, NULL when it's a -EPROBE_DEFER defer situation, and ERR_PTR(-ENODEV) when the CONFIG_EXTCON option is disabled. This is very complicated for the callers to handle and a number of them had bugs that would lead to an Oops. In real life, there are two things which prevented crashes. First, error pointers would only be returned if there was bug in the caller where they passed a NULL "extcon_name" and none of them do that. Second, only two out of the eight drivers will build when CONFIG_EXTCON is disabled. The normal way to write this would be to return -EPROBE_DEFER directly when appropriate and return NULL when CONFIG_EXTCON is disabled. Then the error handling is simple and just looks like: dev->edev = extcon_get_extcon_dev(acpi_dev_name(adev)); if (IS_ERR(dev->edev)) return PTR_ERR(dev->edev); For the two drivers which can build with CONFIG_EXTCON disabled, then extcon_get_extcon_dev() will now return NULL which is not treated as an error and the probe will continue successfully. Those two drivers are "typec_fusb302" and "max8997-battery". In the original code, the typec_fusb302 driver had an 800ms hang in tcpm_get_current_limit() but now that function is a no-op. For the max8997-battery driver everything should continue working as is. Signed-off-by: Dan Carpenter Reviewed-by: Hans de Goede Reviewed-by: Heikki Krogerus Reviewed-by: Guenter Roeck Acked-by: Sebastian Reichel Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 0c19010da77f..685401d94d39 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -296,7 +296,7 @@ static inline void devm_extcon_unregister_notifier_all(struct device *dev, static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) { - return ERR_PTR(-ENODEV); + return NULL; } static inline struct extcon_dev *extcon_find_edev_by_node(struct device_node *node) -- cgit v1.2.3 From 59fba9eed5a736caa257df4738c57ff72492365f Mon Sep 17 00:00:00 2001 From: Yunfei Dong Date: Thu, 12 May 2022 04:19:47 +0200 Subject: media: mediatek: vcodec: support stateless H.264 decoding for mt8192 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds h264 lat and core architecture driver for mt8192, and the decode mode is frame based for stateless decoder. Signed-off-by: Yunfei Dong Tested-by: Nícolas F. R. A. Prado Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/remoteproc/mtk_scp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/remoteproc/mtk_scp.h b/include/linux/remoteproc/mtk_scp.h index b47416f7aeb8..7c2b7cc9fe6c 100644 --- a/include/linux/remoteproc/mtk_scp.h +++ b/include/linux/remoteproc/mtk_scp.h @@ -41,6 +41,8 @@ enum scp_ipi_id { SCP_IPI_ISP_FRAME, SCP_IPI_FD_CMD, SCP_IPI_CROS_HOST_CMD, + SCP_IPI_VDEC_LAT, + SCP_IPI_VDEC_CORE, SCP_IPI_NS_SERVICE = 0xFF, SCP_IPI_MAX = 0x100, }; -- cgit v1.2.3 From ea661ad6e1573d5b08c27444ff2ed403bf39ff66 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 10 May 2022 10:34:03 +0800 Subject: iommu/vt-d: Size Page Request Queue to avoid overflow condition PRQ overflow may cause I/O throughput congestion, resulting in unnecessary degradation of I/O performance. Appropriately increasing the length of PRQ can greatly reduce the occurrence of PRQ overflow. The count of maximum page requests that can be generated in parallel by a PCIe device is statically defined in the Outstanding Page Request Capacity field of the PCIe ATS configure space. The new length of PRQ is calculated by summing up the value of Outstanding Page Request Capacity register across all devices where Page Requests are supported on the real PR-capable platform (Intel Sapphire Rapids). The result is round to the nearest higher power of 2. The PRQ length is also double sized as the VT-d IOMMU driver only updates the Page Request Queue Head Register (PQH_REG) after processing the entire queue. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20220421113558.3504874-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220510023407.2759143-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index b3b125b332aa..207ef06ba3e1 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -9,7 +9,7 @@ #define __INTEL_SVM_H__ /* Page Request Queue depth */ -#define PRQ_ORDER 2 +#define PRQ_ORDER 4 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) #define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) -- cgit v1.2.3 From fc0051cb95909ab56bd8c929f24d48c9870c3e3a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 10 May 2022 10:34:05 +0800 Subject: iommu/vt-d: Check domain force_snooping against attached devices As domain->force_snooping only impacts the devices attached with the domain, there's no need to check against all IOMMU units. On the other hand, force_snooping could be set on a domain no matter whether it has been attached or not, and once set it is an immutable flag. If no device attached, the operation always succeeds. Then this empty domain can be only attached to a device of which the IOMMU supports snoop control. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20220508123525.1973626-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220510023407.2759143-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 72e5d7900e71..4f29139bbfc3 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -540,6 +540,7 @@ struct dmar_domain { u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ + u8 set_pte_snp:1; struct list_head devices; /* all devices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ -- cgit v1.2.3 From 4a18419f71cdf9155d2d2a6c79546f720978b990 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++-- include/linux/mm.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2999190adc22..9a26bd10e083 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -36,8 +36,9 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); -int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, - pgprot_t newprot, unsigned long cp_flags); +int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5dc5005ccc9b..d63ba0e0e068 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1967,10 +1967,11 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, +extern unsigned long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -extern int mprotect_fixup(struct vm_area_struct *vma, +extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); -- cgit v1.2.3 From 4f83145721f362c2f4d312edc4755269a2069488 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm: avoid unnecessary flush on change_huge_pmd() Calls to change_protection_range() on THP can trigger, at least on x86, two TLB flushes for one page: one immediately, when pmdp_invalidate() is called by change_huge_pmd(), and then another one later (that can be batched) when change_protection_range() finishes. The first TLB flush is only necessary to prevent the dirty bit (and with a lesser importance the access bit) from changing while the PTE is modified. However, this is not necessary as the x86 CPUs set the dirty-bit atomically with an additional check that the PTE is (still) present. One caveat is Intel's Knights Landing that has a bug and does not do so. Leverage this behavior to eliminate the unnecessary TLB flush in change_huge_pmd(). Introduce a new arch specific pmdp_invalidate_ad() that only invalidates the access and dirty bit from further changes. Link: https://lkml.kernel.org/r/20220401180821.1986781-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 53750224e176..530b1817b58c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -570,6 +570,26 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD + +/* + * pmdp_invalidate_ad() invalidates the PMD while changing a transparent + * hugepage mapping in the page tables. This function is similar to + * pmdp_invalidate(), but should only be used if the access and dirty bits would + * not be cleared by the software in the new PMD value. The function ensures + * that hardware changes of the access and dirty bits updates would not be lost. + * + * Doing so can allow in certain architectures to avoid a TLB flush in most + * cases. Yet, another TLB flush might be necessary later if the PMD update + * itself requires such flush (e.g., if protection was set to be stricter). Yet, + * even when a TLB flush is needed because of the update, the caller may be able + * to batch these TLB flushing operations, so fewer TLB flush operations are + * needed. + */ +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { -- cgit v1.2.3 From f38adfef7e6bfe681d6602b6668be31fe1310ed0 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 9 May 2022 18:20:51 -0700 Subject: mm/highmem: VM_BUG_ON() if offset + len > PAGE_SIZE Add VM_BUG_ON() bounds checking to make sure that, if "offset + len> PAGE_SIZE", memset() does not corrupt data in adjacent pages. Mainly to match all the similar functions in highmem.h. Link: https://lkml.kernel.org/r/20220426193020.8710-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Andrew Morton Cc: Ira Weiny Cc: Catalin Marinas Cc: "Matthew Wilcox (Oracle)" Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 39bb9b47fa9c..67720bfd6ade 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -358,6 +358,8 @@ static inline void memcpy_to_page(struct page *page, size_t offset, static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); + + VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); -- cgit v1.2.3 From 152e56178ad72037bc6a2f08d4a608543bc67cdf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:51 -0700 Subject: mm/damon/core: add a function for damon_operations registration checks Patch series "mm/damon: allow users know which monitoring ops are available". DAMON users can configure it for vaious address spaces including virtual address spaces and the physical address space by setting its monitoring operations set with appropriate one for their purpose. However, there is no celan and simple way to know exactly which monitoring operations sets are available on the currently running kernel. This patchset adds functions for the purpose on DAMON's kernel API ('damon_is_registered_ops()') and sysfs interface ('avail_operations' file under each context directory). This patch (of 4): To know if a specific 'damon_operations' is registered, users need to check the kernel config or try 'damon_select_ops()' with the ops of the question, and then see if it successes. In the latter case, the user should also revert the change. To make the process simple and convenient, this commit adds a function for checking if a specific 'damon_operations' is registered or not. Link: https://lkml.kernel.org/r/20220426203843.45238-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426203843.45238-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f23cbfa4248d..73ff0e2d2a4d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -509,6 +509,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); +bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); -- cgit v1.2.3 From de6d01542a5c4f6fe6f0c65c14694b760f896acc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:52 -0700 Subject: mm/damon/vaddr: register a damon_operations for fixed virtual address ranges monitoring Patch series "support fixed virtual address ranges monitoring". The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overhead or distractions. This patchset adds supports for the use case on DAMON's kernel API (DAMON_OPS_FVADDR) and sysfs interface ('fvaddr' keyword for 'operations' sysfs file). This patch (of 3): The monitoring operations set for virtual address spaces automatically updates the monitoring target regions to cover entire mappings of the virtual address spaces as much as possible. Some users could have more information about their programs than kernel and therefore have interest in not entire regions but only specific regions. For such cases, the automatic monitoring target regions updates are only unnecessary overheads or distractions. For such cases, DAMON's API users can simply set the '->init()' and '->update()' of the DAMON context's '->ops' NULL, and set the target monitoring regions when creating the context. But, that would be a dirty hack. Worse yet, the hack is unavailable for DAMON user space interface users. To support the use case in a clean way that can easily exported to the user space, this commit adds another monitoring operations set called 'fvaddr', which is same to 'vaddr' but does not automatically update the monitoring regions. Instead, it will only respect the virtual address regions which have explicitly passed at the initial context creation. Note that this commit leave sysfs interface not supporting the feature yet. The support will be made in a following commit. Link: https://lkml.kernel.org/r/20220426231750.48822-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220426231750.48822-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 73ff0e2d2a4d..09a5d0d02c00 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -261,10 +261,13 @@ struct damos { * enum damon_ops_id - Identifier for each monitoring operations implementation * * @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces + * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual + * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space */ enum damon_ops_id { DAMON_OPS_VADDR, + DAMON_OPS_FVADDR, DAMON_OPS_PADDR, NR_DAMON_OPS, }; -- cgit v1.2.3 From 534aa1dc975ac883ad89110534585a96630802a0 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 9 May 2022 18:20:53 -0700 Subject: printk: stop including cache.h from printk.h An inclusion of cache.h in printk.h was added in 2014 in commit c28aa1f0a847 ("printk/cache: mark printk_once test variable __read_mostly") in order to bring in the definition of __read_mostly. The usage of __read_mostly was later removed in commit 3ec25826ae33 ("printk: Tie printk_once / printk_deferred_once into .data.once for reset") which made the inclusion of cache.h unnecessary, so remove it. We have a small amount of code that depended on the inclusion of cache.h from printk.h; fix that code to include the appropriate header. This fixes a circular inclusion on arm64 (linux/printk.h -> linux/cache.h -> asm/cache.h -> linux/kasan-enabled.h -> linux/static_key.h -> linux/jump_label.h -> linux/bug.h -> asm/bug.h -> linux/printk.h) that would otherwise be introduced by the next patch. Build tested using {allyesconfig,defconfig} x {arm64,x86_64}. Link: https://linux-review.googlesource.com/id/I8fd51f72c9ef1f2d6afd3b2cbc875aa4792c1fba Link: https://lkml.kernel.org/r/20220427195820.1716975-1-pcc@google.com Signed-off-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric W. Biederman Cc: Herbert Xu Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/printk.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 1522df223c0f..8e8d74edf121 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From d949a8155d139aa890795b802004a196b7f00598 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm: make minimum slab alignment a runtime property When CONFIG_KASAN_HW_TAGS is enabled we currently increase the minimum slab alignment to 16. This happens even if MTE is not supported in hardware or disabled via kasan=off, which creates an unnecessary memory overhead in those cases. Eliminate this overhead by making the minimum slab alignment a runtime property and only aligning to 16 if KASAN is enabled at runtime. On a DragonBoard 845c (non-MTE hardware) with a kernel built with CONFIG_KASAN_HW_TAGS, waiting for quiescence after a full Android boot I see the following Slab measurements in /proc/meminfo (median of 3 reboots): Before: 169020 kB After: 167304 kB [akpm@linux-foundation.org: make slab alignment type `unsigned int' to avoid casting] Link: https://linux-review.googlesource.com/id/I752e725179b43b144153f4b6f584ceb646473ead Link: https://lkml.kernel.org/r/20220427195820.1716975-2-pcc@google.com Signed-off-by: Peter Collingbourne Reviewed-by: Andrey Konovalov Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes Reviewed-by: Catalin Marinas Acked-by: Vlastimil Babka Cc: Pekka Enberg Cc: Roman Gushchin Cc: Joonsoo Kim Cc: Herbert Xu Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Eric W. Biederman Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/slab.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 373b3ef99f4e..3d2f2a3ca17e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -209,6 +209,18 @@ void kmem_dump_obj(void *object); #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif +/* + * Arches can define this function if they want to decide the minimum slab + * alignment at runtime. The value returned by the function must be a power + * of two and >= ARCH_SLAB_MINALIGN. + */ +#ifndef arch_slab_minalign +static inline unsigned int arch_slab_minalign(void) +{ + return ARCH_SLAB_MINALIGN; +} +#endif + /* * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN -- cgit v1.2.3 From b304c6f0d39d927a87e72a8ac6c89b96ac25f355 Mon Sep 17 00:00:00 2001 From: Hongchen Zhang Date: Mon, 9 May 2022 18:20:53 -0700 Subject: mm/swapops: make is_pmd_migration_entry more strict A pmd migration entry should first be a swap pmd,so use is_swap_pmd(pmd) instead of !pmd_present(pmd). On the other hand, some architecture (MIPS for example) may misjudge a pmd_none entry as a pmd migration entry. Link: https://lkml.kernel.org/r/1651131333-6386-1-git-send-email-zhanghongchen@loongson.cn Signed-off-by: Hongchen Zhang Acked-by: Peter Xu Cc: Alistair Popple Cc: Ralph Campbell Cc: Naoya Horiguchi Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/swapops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d1b728904d4e..82265b055a0d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -331,7 +331,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) static inline int is_pmd_migration_entry(pmd_t pmd) { - return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); + return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, -- cgit v1.2.3 From 6e74d2bf5a265113ca54a8323783d2f3fdde96b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:54 -0700 Subject: mm/damon/core: add a new callback for watermarks checks Patch series "mm/damon: Support online tuning". Effects of DAMON and DAMON-based Operation Schemes highly depends on the configurations. Wrong configurations could even result in unexpected efficiency degradations. For finding a best configuration, repeating incremental configuration changes and results measurements, in other words, online tuning, could be helpful. Nevertheless, DAMON kernel API supports only restrictive online tuning. Worse yet, the sysfs-based DAMON user interface doesn't support online tuning at all. DAMON_RECLAIM also doesn't support online tuning. This patchset makes the DAMON kernel API, DAMON sysfs interface, and DAMON_RECLAIM supports online tuning. Sequence of patches ------------------- First two patches enhance DAMON online tuning for kernel API users. Specifically, patch 1 let kernel API users to be able to do DAMON online tuning without a restriction, and patch 2 makes error handling easier. Following seven patches (patches 3-9) refactor code for better readability and easier reuse of code fragments that will be useful for online tuning support. Patch 10 introduces DAMON callback based user request handling structure for DAMON sysfs interface, and patch 11 enables DAMON online tuning via DAMON sysfs interface. Documentation patch (patch 12) for usage of it follows. Patch 13 enables online tuning of DAMON_RECLAIM and finally patch 14 documents the DAMON_RECLAIM online tuning usage. This patch (of 14): For updating input parameters for running DAMON contexts, DAMON kernel API users can use the contexts' callbacks, as it is the safe place for context internal data accesses. When the context has DAMON-based operation schemes and all schemes are deactivated due to their watermarks, however, DAMON does nothing but only watermarks checks. As a result, no callbacks will be called back, and therefore the kernel API users cannot update the input parameters including monitoring attributes, DAMON-based operation schemes, and watermarks. To let users easily update such DAMON input parameters in such a case, this commit adds a new callback, 'after_wmarks_check()'. It will be called after each watermarks check. Users can do the online input parameters update in the callback even under the schemes deactivated case. Link: https://lkml.kernel.org/r/20220429160606.127307-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 09a5d0d02c00..6cb5ab5d8e9d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -343,6 +343,7 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. + * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_terminate: Called before terminating the monitoring. @@ -353,6 +354,11 @@ struct damon_operations { * respectively. Therefore, those are good places for installing and cleaning * @private. * + * The monitoring thread calls @after_wmarks_check after each DAMON-based + * operation schemes' watermarks check. If users need to make changes to the + * attributes of the monitoring context while it's deactivated due to the + * watermarks, this is the good place to do. + * * The monitoring thread calls @after_sampling and @after_aggregation for each * of the sampling intervals and aggregation intervals, respectively. * Therefore, users can safely access the monitoring results without additional @@ -365,6 +371,7 @@ struct damon_callback { void *private; int (*before_start)(struct damon_ctx *context); + int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); void (*before_terminate)(struct damon_ctx *context); -- cgit v1.2.3 From d0723bc04185b15ed96ade0a0bf9277ef00008db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 9 May 2022 18:20:55 -0700 Subject: mm/damon/vaddr: move 'damon_set_regions()' to core This commit moves 'damon_set_regions()' from vaddr to core, as it is aimed to be used by not only 'vaddr' but also other parts of DAMON. Link: https://lkml.kernel.org/r/20220429160606.127307-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6cb5ab5d8e9d..d1e6ee28a2ff 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -494,6 +494,8 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges); struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, -- cgit v1.2.3 From 679d103319101800567c8bb7e341b5eee39f6685 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm: introduce PTE_MARKER swap entry Patch series "userfaultfd-wp: Support shmem and hugetlbfs", v8. Overview ======== Userfaultfd-wp anonymous support was merged two years ago. There're quite a few applications that started to leverage this capability either to take snapshots for user-app memory, or use it for full user controled swapping. This series tries to complete the feature for uffd-wp so as to cover all the RAM-based memory types. So far uffd-wp is the only missing piece of the rest features (uffd-missing & uffd-minor mode). One major reason to do so is that anonymous pages are sometimes not satisfying the need of applications, and there're growing users of either shmem and hugetlbfs for either sharing purpose (e.g., sharing guest mem between hypervisor process and device emulation process, shmem local live migration for upgrades), or for performance on tlb hits. All these mean that if a uffd-wp app wants to switch to any of the memory types, it'll stop working. I think it's worthwhile to have the kernel to cover all these aspects. This series chose to protect pages in pte level not page level. One major reason is safety. I have no idea how we could make it safe if any of the uffd-privileged app can wr-protect a page that any other application can use. It means this app can block any process potentially for any time it wants. The other reason is that it aligns very well with not only the anonymous uffd-wp solution, but also uffd as a whole. For example, userfaultfd is implemented fundamentally based on VMAs. We set flags to VMAs showing the status of uffd tracking. For another per-page based protection solution, it'll be crossing the fundation line on VMA-based, and it could simply be too far away already from what's called userfaultfd. PTE markers =========== The patchset is based on the idea called PTE markers. It was discussed in one of the mm alignment sessions, proposed starting from v6, and this is the 2nd version of it using PTE marker idea. PTE marker is a new type of swap entry that is ony applicable to file backed memories like shmem and hugetlbfs. It's used to persist some pte-level information even if the original present ptes in pgtable are zapped. Logically pte markers can store more than uffd-wp information, but so far only one bit is used for uffd-wp purpose. When the pte marker is installed with uffd-wp bit set, it means this pte is wr-protected by uffd. It solves the problem on e.g. file-backed memory mapped ptes got zapped due to any reason (e.g. thp split, or swapped out), we can still keep the wr-protect information in the ptes. Then when the page fault triggers again, we'll know this pte is wr-protected so we can treat the pte the same as a normal uffd wr-protected pte. The extra information is encoded into the swap entry, or swp_offset to be explicit, with the swp_type being PTE_MARKER. So far uffd-wp only uses one bit out of the swap entry, the rest bits of swp_offset are still reserved for other purposes. There're two configs to enable/disable PTE markers: CONFIG_PTE_MARKER CONFIG_PTE_MARKER_UFFD_WP We can set !PTE_MARKER to completely disable all the PTE markers, along with uffd-wp support. I made two config so we can also enable PTE marker but disable uffd-wp file-backed for other purposes. At the end of current series, I'll enable CONFIG_PTE_MARKER by default, but that patch is standalone and if anyone worries about having it by default, we can also consider turn it off by dropping that oneliner patch. So far I don't see a huge risk of doing so, so I kept that patch. In most cases, PTE markers should be treated as none ptes. It is because that unlike most of the other swap entry types, there's no PFN or block offset information encoded into PTE markers but some extra well-defined bits showing the status of the pte. These bits should only be used as extra data when servicing an upcoming page fault, and then we behave as if it's a none pte. I did spend a lot of time observing all the pte_none() users this time. It is indeed a challenge because there're a lot, and I hope I didn't miss a single of them when we should take care of pte markers. Luckily, I don't think it'll need to be considered in many cases, for example: boot code, arch code (especially non-x86), kernel-only page handlings (e.g. CPA), or device driver codes when we're tackling with pure PFN mappings. I introduced pte_none_mostly() in this series when we need to handle pte markers the same as none pte, the "mostly" is the other way to write "either none pte or a pte marker". I didn't replace pte_none() to cover pte markers for below reasons: - Very rare case of pte_none() callers will handle pte markers. E.g., all the kernel pages do not require knowledge of pte markers. So we don't pollute the major use cases. - Unconditionally change pte_none() semantics could confuse people, because pte_none() existed for so long a time. - Unconditionally change pte_none() semantics could make pte_none() slower even if in many cases pte markers do not exist. - There're cases where we'd like to handle pte markers differntly from pte_none(), so a full replace is also impossible. E.g. khugepaged should still treat pte markers as normal swap ptes rather than none ptes, because pte markers will always need a fault-in to merge the marker with a valid pte. Or the smap code will need to parse PTE markers not none ptes. Patch Layout ============ Introducing PTE marker and uffd-wp bit in PTE marker: mm: Introduce PTE_MARKER swap entry mm: Teach core mm about pte markers mm: Check against orig_pte for finish_fault() mm/uffd: PTE_MARKER_UFFD_WP Adding support for shmem uffd-wp: mm/shmem: Take care of UFFDIO_COPY_MODE_WP mm/shmem: Handle uffd-wp special pte in page fault handler mm/shmem: Persist uffd-wp bit across zapping for file-backed mm/shmem: Allow uffd wr-protect none pte for file-backed mem mm/shmem: Allows file-back mem to be uffd wr-protected on thps mm/shmem: Handle uffd-wp during fork() Adding support for hugetlbfs uffd-wp: mm/hugetlb: Introduce huge pte version of uffd-wp helpers mm/hugetlb: Hook page faults for uffd write protection mm/hugetlb: Take care of UFFDIO_COPY_MODE_WP mm/hugetlb: Handle UFFDIO_WRITEPROTECT mm/hugetlb: Handle pte markers in page faults mm/hugetlb: Allow uffd wr-protect none ptes mm/hugetlb: Only drop uffd-wp special pte if required mm/hugetlb: Handle uffd-wp during fork() Misc handling on the rest mm for uffd-wp file-backed: mm/khugepaged: Don't recycle vma pgtable if uffd-wp registered mm/pagemap: Recognize uffd-wp bit for shmem/hugetlbfs Enabling of uffd-wp on file-backed memory: mm/uffd: Enable write protection for shmem & hugetlbfs mm: Enable PTE markers by default selftests/uffd: Enable uffd-wp for shmem/hugetlbfs Tests ===== - Compile test on x86_64 and aarch64 on different configs - Kernel selftests - uffd-test [0] - Umapsort [1,2] test for shmem/hugetlb, with swap on/off [0] https://github.com/xzpeter/clibs/tree/master/uffd-test [1] https://github.com/xzpeter/umap-apps/tree/peter [2] https://github.com/xzpeter/umap/tree/peter-shmem-hugetlbfs This patch (of 23): Introduces a new swap entry type called PTE_MARKER. It can be installed for any pte that maps a file-backed memory when the pte is temporarily zapped, so as to maintain per-pte information. The information that kept in the pte is called a "marker". Here we define the marker as "unsigned long" just to match pgoff_t, however it will only work if it still fits in swp_offset(), which is e.g. currently 58 bits on x86_64. A new config CONFIG_PTE_MARKER is introduced too; it's by default off. A bunch of helpers are defined altogether to service the rest of the pte marker code. [peterx@redhat.com: fixup] Link: https://lkml.kernel.org/r/Yk2rdB7SXZf+2BDF@xz-m1.local Link: https://lkml.kernel.org/r/20220405014646.13522-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220405014646.13522-2-peterx@redhat.com Signed-off-by: Peter Xu Cc: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Alistair Popple Cc: Nadav Amit Cc: Axel Rasmussen Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Jerome Glisse Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/swap.h | 15 +++++++++- include/linux/swapops.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7daae5a4b3e1..5553189d0215 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,19 @@ static inline int current_is_kswapd(void) * actions on faults. */ +/* + * PTE markers are used to persist information onto PTEs that are mapped with + * file-backed memories. As its name "PTE" hints, it should only be applied to + * the leaves of pgtables. + */ +#ifdef CONFIG_PTE_MARKER +#define SWP_PTE_MARKER_NUM 1 +#define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM) +#else +#define SWP_PTE_MARKER_NUM 0 +#endif + /* * Unaddressable device memory support. See include/linux/hmm.h and * Documentation/vm/hmm.rst. Short description is we need struct pages for @@ -107,7 +120,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 82265b055a0d..bb70da461b7d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -274,6 +274,84 @@ static inline int is_readable_migration_entry(swp_entry_t entry) #endif +typedef unsigned long pte_marker; + +#define PTE_MARKER_MASK (0) + +#ifdef CONFIG_PTE_MARKER + +static inline swp_entry_t make_pte_marker_entry(pte_marker marker) +{ + return swp_entry(SWP_PTE_MARKER, marker); +} + +static inline bool is_pte_marker_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_PTE_MARKER; +} + +static inline pte_marker pte_marker_get(swp_entry_t entry) +{ + return swp_offset(entry) & PTE_MARKER_MASK; +} + +static inline bool is_pte_marker(pte_t pte) +{ + return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); +} + +#else /* CONFIG_PTE_MARKER */ + +static inline swp_entry_t make_pte_marker_entry(pte_marker marker) +{ + /* This should never be called if !CONFIG_PTE_MARKER */ + WARN_ON_ONCE(1); + return swp_entry(0, 0); +} + +static inline bool is_pte_marker_entry(swp_entry_t entry) +{ + return false; +} + +static inline pte_marker pte_marker_get(swp_entry_t entry) +{ + return 0; +} + +static inline bool is_pte_marker(pte_t pte) +{ + return false; +} + +#endif /* CONFIG_PTE_MARKER */ + +static inline pte_t make_pte_marker(pte_marker marker) +{ + return swp_entry_to_pte(make_pte_marker_entry(marker)); +} + +/* + * This is a special version to check pte_none() just to cover the case when + * the pte is a pte marker. It existed because in many cases the pte marker + * should be seen as a none pte; it's just that we have stored some information + * onto the none pte so it becomes not-none any more. + * + * It should be used when the pte is file-backed, ram-based and backing + * userspace pages, like shmem. It is not needed upon pgtables that do not + * support pte markers at all. For example, it's not needed on anonymous + * memory, kernel-only memory (including when the system is during-boot), + * non-ram based generic file-system. It's fine to be used even there, but the + * extra pte marker check will be pure overhead. + * + * For systems configured with !CONFIG_PTE_MARKER this will be automatically + * optimized to pte_none(). + */ +static inline int pte_none_mostly(pte_t pte) +{ + return pte_none(pte) || is_pte_marker(pte); +} + static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset(entry)); -- cgit v1.2.3 From f46f2adecdcc1ba0799383e67fe98f65f41fea5c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm: check against orig_pte for finish_fault() This patch allows do_fault() to trigger on !pte_none() cases too. This prepares for the pte markers to be handled by do_fault() just like none pte. To achieve this, instead of unconditionally check against pte_none() in finish_fault(), we may hit the case that the orig_pte was some pte marker so what we want to do is to replace the pte marker with some valid pte entry. Then if orig_pte was set we'd want to check the current *pte (under pgtable lock) against orig_pte rather than none pte. Right now there's no solid way to safely reference orig_pte because when pmd is not allocated handle_pte_fault() will not initialize orig_pte, so it's not safe to reference it. There's another solution proposed before this patch to do pte_clear() for vmf->orig_pte for pmd==NULL case, however it turns out it'll break arm32 because arm32 could have assumption that pte_t* pointer will always reside on a real ram32 pgtable, not any kernel stack variable. To solve this, we add a new flag FAULT_FLAG_ORIG_PTE_VALID, and it'll be set along with orig_pte when there is valid orig_pte, or it'll be cleared when orig_pte was not initialized. It'll be updated every time we call handle_pte_fault(), so e.g. if a page fault retry happened it'll be properly updated along with orig_pte. [1] https://lore.kernel.org/lkml/710c48c9-406d-e4c5-a394-10501b951316@samsung.com/ [akpm@linux-foundation.org: coding-style cleanups] [peterx@redhat.com: fix crash reported by Marek] Link: https://lkml.kernel.org/r/Ylb9rXJyPm8/ao8f@xz-m1.local Link: https://lkml.kernel.org/r/20220405014836.14077-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Tested-by: Marek Szyprowski Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7216d77a5884..dd382270ae40 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -822,6 +822,8 @@ typedef struct { * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark * exclusive) a possibly shared anonymous page that is * mapped R/O. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two @@ -858,6 +860,7 @@ enum fault_flag { FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, }; #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 1db9dbc2ef05205bf1022f9b14aa29b1dd8efd7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm/uffd: PTE_MARKER_UFFD_WP This patch introduces the 1st user of pte marker: the uffd-wp marker. When the pte marker is installed with the uffd-wp bit set, it means this pte was wr-protected by uffd. We will use this special pte to arm the ptes that got either unmapped or swapped out for a file-backed region that was previously wr-protected. This special pte could trigger a page fault just like swap entries. This idea is greatly inspired by Hugh and Andrea in the discussion, which is referenced in the links below. Some helpers are introduced to detect whether a swap pte is uffd wr-protected. After the pte marker introduced, one swap pte can be wr-protected in two forms: either it is a normal swap pte and it has _PAGE_SWP_UFFD_WP set, or it's a pte marker that has PTE_MARKER_UFFD_WP set. [peterx@redhat.com: fixup] Link: https://lkml.kernel.org/r/YkzKiM8tI4+qOfXF@xz-m1.local Link: https://lore.kernel.org/lkml/20201126222359.8120-1-peterx@redhat.com/ Link: https://lore.kernel.org/lkml/20201130230603.46187-1-peterx@redhat.com/ Link: https://lkml.kernel.org/r/20220405014838.14131-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Hugh Dickins Cc: Alistair Popple Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/swapops.h | 3 ++- include/linux/userfaultfd_k.h | 47 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bb70da461b7d..09945342bf76 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -276,7 +276,8 @@ static inline int is_readable_migration_entry(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_MASK (0) +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) #ifdef CONFIG_PTE_MARKER diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 33cea484d1ad..3354d9ddc35f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -15,6 +15,8 @@ #include #include +#include +#include #include /* The set of all possible UFFD-related VM flags. */ @@ -236,4 +238,49 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, #endif /* CONFIG_USERFAULTFD */ +static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); +#else + return false; +#endif +} + +static inline bool pte_marker_uffd_wp(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + swp_entry_t entry; + + if (!is_swap_pte(pte)) + return false; + + entry = pte_to_swp_entry(pte); + + return pte_marker_entry_uffd_wp(entry); +#else + return false; +#endif +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. + */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!is_swap_pte(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_marker_uffd_wp(pte)) + return true; +#endif + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From 8ee79edff6d3b43b2d0c1e41f92b32e128988b22 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm/shmem: take care of UFFDIO_COPY_MODE_WP Pass wp_copy into shmem_mfill_atomic_pte() through the stack, then apply the UFFD_WP bit properly when the UFFDIO_COPY on shmem is with UFFDIO_COPY_MODE_WP. wp_copy lands mfill_atomic_install_pte() finally. Note: we must do pte_wrprotect() if !writable in mfill_atomic_install_pte(), as mk_pte() could return a writable pte (e.g., when VM_SHARED on a shmem file). Link: https://lkml.kernel.org/r/20220405014841.14185-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 3e915cc550bc..a68f982f22d1 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -145,11 +145,11 @@ extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, + bool zeropage, bool wp_copy, struct page **pagep); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ - src_addr, zeropage, pagep) ({ BUG(); 0; }) + src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ -- cgit v1.2.3 From 9c28a205c06123b9f0a0c4d819ece9f5f552d004 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: handle uffd-wp special pte in page fault handler File-backed memories are prone to unmap/swap so the ptes are always unstable, because they can be easily faulted back later using the page cache. This could lead to uffd-wp getting lost when unmapping or swapping out such memory. One example is shmem. PTE markers are needed to store those information. This patch prepares it by handling uffd-wp pte markers first it is applied elsewhere, so that the page fault handler can recognize uffd-wp pte markers. The handling of uffd-wp pte markers is similar to missing fault, it's just that we'll handle this "missing fault" when we see the pte markers, meanwhile we need to make sure the marker information is kept during processing the fault. This is a slow path of uffd-wp handling, because zapping of wr-protected shmem ptes should be rare. So far it should only trigger in two conditions: (1) When trying to punch holes in shmem_fallocate(), there is an optimization to zap the pgtables before evicting the page. (2) When swapping out shmem pages. Because of this, the page fault handling is simplifed too by not sending the wr-protect message in the 1st page fault, instead the page will be installed read-only, so the uffd-wp message will be generated in the next fault, which will trigger the do_wp_page() path of general uffd-wp handling. Disable fault-around for all uffd-wp registered ranges for extra safety just like uffd-minor fault, and clean the code up. Link: https://lkml.kernel.org/r/20220405014844.14239-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3354d9ddc35f..e7afcdfd4b46 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -96,6 +96,18 @@ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } +/* + * Don't do fault around for either WP or MINOR registered uffd range. For + * MINOR registered range, fault around will be a total disaster and ptes can + * be installed without notifications; for WP it should mostly be fine as long + * as the fault around checks for pte_none() before the installation, however + * to be super safe we just forbid it. + */ +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; @@ -236,6 +248,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -- cgit v1.2.3 From 999dad824c39ed14dee7c4412aae531ba9e74a90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: persist uffd-wp bit across zapping for file-backed File-backed memory is prone to being unmapped at any time. It means all information in the pte will be dropped, including the uffd-wp flag. To persist the uffd-wp flag, we'll use the pte markers. This patch teaches the zap code to understand uffd-wp and know when to keep or drop the uffd-wp bit. Add a new flag ZAP_FLAG_DROP_MARKER and set it in zap_details when we don't want to persist such an information, for example, when destroying the whole vma, or punching a hole in a shmem file. For the rest cases we should never drop the uffd-wp bit, or the wr-protect information will get lost. The new ZAP_FLAG_DROP_MARKER needs to be put into mm.h rather than memory.c because it'll be further referenced in hugetlb files later. Link: https://lkml.kernel.org/r/20220405014847.14295-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ include/linux/mm_inline.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d63ba0e0e068..61786259e52a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3428,4 +3428,14 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif +typedef unsigned int __bitwise zap_flags_t; + +/* + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. + */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ac32125745ab..7b25b53c474a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -316,5 +318,46 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +/* + * If this pte is wr-protected by uffd-wp in any form, arm the special pte to + * replace a none pte. NOTE! This should only be called when *pte is already + * cleared so we will never accidentally replace something valuable. Meanwhile + * none pte also means we are not demoting the pte so tlb flushed is not needed. + * E.g., when pte cleared the caller should have taken care of the tlb flush. + * + * Must be called with pgtable lock held so that no thread will see the none + * pte, and if they see it, they'll fault and serialize at the pgtable lock. + * + * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + */ +static inline void +pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, + pte_t *pte, pte_t pteval) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + bool arm_uffd_pte = false; + + /* The current status of the pte should be "cleared" before calling */ + WARN_ON_ONCE(!pte_none(*pte)); + + if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) + return; + + /* A uffd-wp wr-protected normal pte */ + if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) + arm_uffd_pte = true; + + /* + * A uffd-wp wr-protected swap pte. Note: this should even cover an + * existing pte marker with uffd-wp bit set. + */ + if (unlikely(pte_swp_uffd_wp_any(pteval))) + arm_uffd_pte = true; + + if (unlikely(arm_uffd_pte)) + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif +} #endif -- cgit v1.2.3 From 6041c69179034278ac6d57f90a55b09e588f4b90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: take care of UFFDIO_COPY_MODE_WP Pass the wp_copy variable into hugetlb_mcopy_atomic_pte() thoughout the stack. Apply the UFFD_WP bit if UFFDIO_COPY_MODE_WP is with UFFDIO_COPY. Hugetlb pages are only managed by hugetlbfs, so we're safe even without setting dirty bit in the huge pte if the page is installed as read-only. However we'd better still keep the dirty bit set for a read-only UFFDIO_COPY pte (when UFFDIO_COPY_MODE_WP bit is set), not only to match what we do with shmem, but also because the page does contain dirty data that the kernel just copied from the userspace. Link: https://lkml.kernel.org/r/20220405014904.14643-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac2ece9e9c79..159b253e523d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -160,7 +160,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep); + struct page **pagep, + bool wp_copy); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -356,7 +357,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, enum mcopy_atomic_mode mode, - struct page **pagep) + struct page **pagep, + bool wp_copy) { BUG(); return 0; -- cgit v1.2.3 From 5a90d5a103c2badfcf12d48e2fec350969e3f486 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:54 -0700 Subject: mm/hugetlb: handle UFFDIO_WRITEPROTECT This starts from passing cp_flags into hugetlb_change_protection() so hugetlb will be able to handle MM_CP_UFFD_WP[_RESOLVE] requests. huge_pte_clear_uffd_wp() is introduced to handle the case where the UFFDIO_WRITEPROTECT is requested upon migrating huge page entries. Link: https://lkml.kernel.org/r/20220405014906.14708-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 159b253e523d..f1143f1fb444 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -211,7 +211,8 @@ struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned long address, unsigned long end, pgprot_t newprot); + unsigned long address, unsigned long end, pgprot_t newprot, + unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); @@ -397,7 +398,8 @@ static inline void move_hugetlb_state(struct page *oldpage, static inline unsigned long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, - unsigned long end, pgprot_t newprot) + unsigned long end, pgprot_t newprot, + unsigned long cp_flags) { return 0; } -- cgit v1.2.3 From 05e90bd05eea33fc77d6b11e121e2da01fee379f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/hugetlb: only drop uffd-wp special pte if required As with shmem uffd-wp special ptes, only drop the uffd-wp special swap pte if unmapping an entire vma or synchronized such that faults can not race with the unmap operation. This requires passing zap_flags all the way to the lowest level hugetlb unmap routine: __unmap_hugepage_range. In general, unmap calls originated in hugetlbfs code will pass the ZAP_FLAG_DROP_MARKER flag as synchronization is in place to prevent faults. The exception is hole punch which will first unmap without any synchronization. Later when hole punch actually removes the page from the file, it will check to see if there was a subsequent fault and if so take the hugetlb fault mutex while unmapping again. This second unmap will pass in ZAP_FLAG_DROP_MARKER. The justification of "whether to apply ZAP_FLAG_DROP_MARKER flag when unmap a hugetlb range" is (IMHO): we should never reach a state when a page fault could errornously fault in a page-cache page that was wr-protected to be writable, even in an extremely short period. That could happen if e.g. we pass ZAP_FLAG_DROP_MARKER when hugetlbfs_punch_hole() calls hugetlb_vmdelete_list(), because if a page faults after that call and before remove_inode_hugepages() is executed, the page cache can be mapped writable again in the small racy window, that can cause unexpected data overwritten. [peterx@redhat.com: fix sparse warning] Link: https://lkml.kernel.org/r/Ylcdw8I1L5iAoWhb@xz-m1.local [akpm@linux-foundation.org: move zap_flags_t from mm.h to mm_types.h to fix build issues] Link: https://lkml.kernel.org/r/20220405014915.14873-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 +++++--- include/linux/mm.h | 2 -- include/linux/mm_types.h | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f1143f1fb444..19cec415f546 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -143,11 +143,12 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, unsigned long *, unsigned long *, long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); + unsigned long, unsigned long, struct page *, + zap_flags_t); void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page); + struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo(void); @@ -406,7 +407,8 @@ static inline unsigned long hugetlb_change_protection( static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) { BUG(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 61786259e52a..de32c0383387 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3428,8 +3428,6 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, } #endif -typedef unsigned int __bitwise zap_flags_t; - /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dd382270ae40..b34ff2cdbc4f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -863,4 +863,6 @@ enum fault_flag { FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, }; +typedef unsigned int __bitwise zap_flags_t; + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From bc70fbf269fdff410b0b6d75c3770b9f59117b90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:55 -0700 Subject: mm/hugetlb: handle uffd-wp during fork() Firstly, we'll need to pass in dst_vma into copy_hugetlb_page_range() because for uffd-wp it's the dst vma that matters on deciding how we should treat uffd-wp protected ptes. We should recognize pte markers during fork and do the pte copy if needed. [lkp@intel.com: vma_needs_copy can be static] Link: https://lkml.kernel.org/r/Ylb0CGeFJlc4EzLk@7ec4ff11d4ae Link: https://lkml.kernel.org/r/20220405014918.14932-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 19cec415f546..04f0186b089b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -137,7 +137,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); -int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); +int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, + struct vm_area_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -269,7 +270,9 @@ static inline struct page *follow_huge_addr(struct mm_struct *mm, } static inline int copy_hugetlb_page_range(struct mm_struct *dst, - struct mm_struct *src, struct vm_area_struct *vma) + struct mm_struct *src, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { BUG(); return 0; -- cgit v1.2.3 From b1f9e876862d8f7176299ec4fb2108bc1045cbc8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm/uffd: enable write protection for shmem & hugetlbfs We've had all the necessary changes ready for both shmem and hugetlbfs. Turn on all the shmem/hugetlbfs switches for userfaultfd-wp. We can expand UFFD_API_RANGE_IOCTLS_BASIC with _UFFDIO_WRITEPROTECT too because all existing types now support write protection mode. Since vma_can_userfault() will be used elsewhere, move into userfaultfd_k.h. Link: https://lkml.kernel.org/r/20220405014926.15101-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e7afcdfd4b46..732b522bacb7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -18,6 +18,7 @@ #include #include #include +#include /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) @@ -140,6 +141,25 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if (vm_flags & VM_UFFD_MINOR) + return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); + +#ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. + */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; +#endif + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); -- cgit v1.2.3 From b48d8a8e5ce53e3114a1ffe96563e3555b51d40b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:57 -0700 Subject: mm: page_isolation: move has_unmovable_pages() to mm/page_isolation.c Patch series "Use pageblock_order for cma and alloc_contig_range alignment", v11. This patchset tries to remove the MAX_ORDER-1 alignment requirement for CMA and alloc_contig_range(). It prepares for my upcoming changes to make MAX_ORDER adjustable at boot time[1]. The MAX_ORDER - 1 alignment requirement comes from that alloc_contig_range() isolates pageblocks to remove free memory from buddy allocator but isolating only a subset of pageblocks within a page spanning across multiple pageblocks causes free page accounting issues. Isolated page might not be put into the right free list, since the code assumes the migratetype of the first pageblock as the whole free page migratetype. This is based on the discussion at [2]. To remove the requirement, this patchset: 1. isolates pages at pageblock granularity instead of max(MAX_ORDER_NR_PAEGS, pageblock_nr_pages); 2. splits free pages across the specified range or migrates in-use pages across the specified range then splits the freed page to avoid free page accounting issues (it happens when multiple pageblocks within a single page have different migratetypes); 3. only checks unmovable pages within the range instead of MAX_ORDER - 1 aligned range during isolation to avoid alloc_contig_range() failure when pageblocks within a MAX_ORDER - 1 aligned range are allocated separately. 4. returns pages not in the range as it did before. One optimization might come later: 1. make MIGRATE_ISOLATE a separate bit to be able to restore the original migratetypes when isolation fails in the middle of the range. [1] https://lore.kernel.org/linux-mm/20210805190253.2795604-1-zi.yan@sent.com/ [2] https://lore.kernel.org/linux-mm/d19fb078-cb9b-f60f-e310-fdeea1b947d2@redhat.com/ This patch (of 6): has_unmovable_pages() is only used in mm/page_isolation.c. Move it from mm/page_alloc.c and make it static. Link: https://lkml.kernel.org/r/20220425143118.2850746-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Oscar Salvador Reviewed-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Vlastimil Babka Cc: Mel Gorman Cc: Eric Ren Cc: Christophe Leroy Cc: Minchan Kim Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 572458016331..e14eddf6741a 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,8 +33,6 @@ static inline bool is_migrate_isolate(int migratetype) #define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 -struct page *has_unmovable_pages(struct zone *zone, struct page *page, - int migratetype, int flags); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); -- cgit v1.2.3 From b2c9e2fbba32539626522b6aed30d1dde7b7e971 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: make alloc_contig_range work at pageblock granularity alloc_contig_range() worked at MAX_ORDER_NR_PAGES granularity to avoid merging pageblocks with different migratetypes. It might unnecessarily convert extra pageblocks at the beginning and at the end of the range. Change alloc_contig_range() to work at pageblock granularity. Special handling is needed for free pages and in-use pages across the boundaries of the range specified by alloc_contig_range(). Because these= Partially isolated pages causes free page accounting issues. The free pages will be split and freed into separate migratetype lists; the in-use= Pages will be migrated then the freed pages will be handled in the aforementioned way. [ziy@nvidia.com: fix deadlock/crash] Link: https://lkml.kernel.org/r/23A7297E-6C84-4138-A9FE-3598234004E6@nvidia.com Link: https://lkml.kernel.org/r/20220425143118.2850746-4-zi.yan@sent.com Signed-off-by: Zi Yan Reported-by: kernel test robot Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index e14eddf6741a..5456b7be38ae 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -42,7 +42,7 @@ int move_freepages_block(struct zone *zone, struct page *page, */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype, int flags); + int migratetype, int flags, gfp_t gfp_flags); /* * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. @@ -50,7 +50,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype); + int migratetype); /* * Test all pages in [start_pfn, end_pfn) are isolated or not. -- cgit v1.2.3 From 11ac3e87ce09c27f4587a8c4fe0829d814021a82 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 12 May 2022 20:22:58 -0700 Subject: mm: cma: use pageblock_order as the single alignment Now alloc_contig_range() works at pageblock granularity. Change CMA allocation, which uses alloc_contig_range(), to use pageblock_nr_pages alignment. Link: https://lkml.kernel.org/r/20220425143118.2850746-6-zi.yan@sent.com Signed-off-by: Zi Yan Cc: Christophe Leroy Cc: David Hildenbrand Cc: Eric Ren Cc: kernel test robot Cc: Mel Gorman Cc: Mike Rapoport Cc: Minchan Kim Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/cma.h | 4 ++-- include/linux/mmzone.h | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index a6f637342740..63873b93deaa 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -17,11 +17,11 @@ #define CMA_MAX_NAME 64 /* - * TODO: once the buddy -- especially pageblock merging and alloc_contig_range() + * the buddy -- especially pageblock merging and alloc_contig_range() * -- can deal with only some pageblocks of a higher-order page being * MIGRATE_CMA, we can use pageblock_nr_pages. */ -#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES +#define CMA_MIN_ALIGNMENT_PAGES pageblock_nr_pages #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) struct cma; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 46ffab808f03..aab70355d64f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -54,10 +54,7 @@ enum migratetype { * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by - * __free_pageblock_cma() function. What is important though - * is that a range of pageblocks must be aligned to - * MAX_ORDER_NR_PAGES should biggest page be bigger than - * a single pageblock. + * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif -- cgit v1.2.3 From adf88aa8ea7ff143825a2a8a7193f92e0e6fc79b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:01 -0700 Subject: mm: remove alloc_pages_vma() All callers have now been converted to use vma_alloc_folio(), so convert the body of alloc_pages_vma() to allocate folios instead. Link: https://lkml.kernel.org/r/20220504182857.4013401-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/gfp.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3e3d36fc2109..2a08a3c4ba95 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -613,13 +613,8 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); -struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr, - bool hugepage); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -629,16 +624,17 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, hugepage) \ - alloc_pages(gfp_mask, order) #define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ folio_alloc(gfp, order) -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, false) +static inline struct page *alloc_page_vma(gfp_t gfp, + struct vm_area_struct *vma, unsigned long addr) +{ + struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); + + return &folio->page; +} extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -- cgit v1.2.3 From e2e3fdc7d4afdb8e7ba981eba7827993f2d390a8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:02 -0700 Subject: swap: turn get_swap_page() into folio_alloc_swap() This removes an assumption that a large folio is HPAGE_PMD_NR pages in size. Link: https://lkml.kernel.org/r/20220504182857.4013401-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/swap.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5553189d0215..252c23a6667f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -470,7 +470,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(struct page *page); +swp_entry_t folio_alloc_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -583,7 +583,7 @@ static inline int try_to_free_swap(struct page *page) return 0; } -static inline swp_entry_t get_swap_page(struct page *page) +static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; entry.val = 0; @@ -643,12 +643,13 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); -extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct folio *folio, + swp_entry_t entry) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_try_charge_swap(page, entry); + return __mem_cgroup_try_charge_swap(folio, entry); } extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); @@ -666,7 +667,7 @@ static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { } -static inline int mem_cgroup_try_charge_swap(struct page *page, +static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { return 0; -- cgit v1.2.3 From 64daa5d818ae3430f0785206b0af13ef528cb9ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: vmscan: convert lazy freeing to folios Remove a hidden call to compound_head(), and account nr_pages instead of a single page. This matches the code in lru_lazyfree_fn() that accounts nr_pages to PGLAZYFREE. Link: https://lkml.kernel.org/r/20220504182857.4013401-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fe580cb96683..8ea4b541c31e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1057,6 +1057,15 @@ static inline void count_memcg_page_event(struct page *page, count_memcg_events(memcg, idx, 1); } +static inline void count_memcg_folio_events(struct folio *folio, + enum vm_event_item idx, unsigned long nr) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + if (memcg) + count_memcg_events(memcg, idx, nr); +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { @@ -1494,6 +1503,11 @@ static inline void count_memcg_page_event(struct page *page, { } +static inline void count_memcg_folio_events(struct folio *folio, + enum vm_event_item idx, unsigned long nr) +{ +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { -- cgit v1.2.3 From dc786690a6a1d9a680e6872821291ad7ca9f520d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:03 -0700 Subject: mm: allow can_split_folio() to be called when THP are disabled The call to can_split_folio() in vmscan is currently guarded by a test of PageTransHuge() so the BUILD_BUG() is eliminated if THP are disabled. The next patch replaces that test with folio_test_large() which may be true, even when THP are disabled. However, if THP are disabled, we cannot split, so an unconditional return of false is appropriate. Link: https://lkml.kernel.org/r/20220504182857.4013401-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9a26bd10e083..fbf36bb1be22 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -348,7 +348,6 @@ static inline void prep_transhuge_page(struct page *page) {} static inline bool can_split_folio(struct folio *folio, int *pextra_pins) { - BUILD_BUG(); return false; } static inline int -- cgit v1.2.3 From 039bc1240165c99b932fb4be2f784948d1038eea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:04 -0700 Subject: mm/swap: add folio_throttle_swaprate The only use of the page argument to cgroup_throttle_swaprate() is to get the node ID, and this will be the same for all pages in the folio, so just pass in the first page of the folio. Link: https://lkml.kernel.org/r/20220504182857.4013401-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 252c23a6667f..6bf1506e5080 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -640,6 +640,10 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { } #endif +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) +{ + cgroup_throttle_swaprate(&folio->page, gfp); +} #ifdef CONFIG_MEMCG_SWAP void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); -- cgit v1.2.3 From da08e9b7932345aff0a748c55f8a25709505f2a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm/shmem: convert shmem_swapin_page() to shmem_swapin_folio() shmem_swapin_page() only brings in order-0 pages, which are folios by definition. Link: https://lkml.kernel.org/r/20220504182857.4013401-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 530b1817b58c..39fa93e94b80 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -758,7 +758,7 @@ static inline void arch_swap_invalidate_area(int type) #endif #ifndef __HAVE_ARCH_SWAP_RESTORE -static inline void arch_swap_restore(swp_entry_t entry, struct page *page) +static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif -- cgit v1.2.3 From a9595b305c0f6cbbf9505325509d95637f6a7062 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm: add folio_mapping_flags() This is the equivalent of PageMappingFlags and is needed for converting mm/migrate.c to folios. Link: https://lkml.kernel.org/r/20220504182857.4013401-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b70124b9c7c1..97eb88dbcbbc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -650,6 +650,11 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +static __always_inline bool folio_mapping_flags(struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; +} + static __always_inline int PageMappingFlags(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 8b463be3a0241558071e6ba9bf11e4bf42ccc856 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 12 May 2022 20:23:05 -0700 Subject: mm: add folio_test_movable() This is the folio equivalent of PageMovable() which is needed to convert mm/migrate.c to folios. Link: https://lkml.kernel.org/r/20220504182857.4013401-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/migrate.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2707bfd43a0d..069a89e847f3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -105,6 +105,11 @@ static inline void __ClearPageMovable(struct page *page) } #endif +static inline bool folio_test_movable(struct folio *folio) +{ + return PageMovable(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); -- cgit v1.2.3 From de8c8e52836d0082188508548d0b939f49f7f0e6 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: add hooks to public helpers Move ptep_clear() to the include/linux/pgtable.h and add page table check relate hooks to some helpers, it's prepare for support page table check feature on new architecture. Optimize the implementation of ptep_clear(), page table hooks added page table check stubs, the interface control should be at stubs, there is no rationale for doing a IS_ENABLED() check here. For architectures that do not enable CONFIG_PAGE_TABLE_CHECK, they will call a fallback page table check stubs[1] when getting their page table helpers[2] in include/linux/pgtable.h. [1] page table check stubs defined in include/linux/page_table_check.h [2] ptep_clear() ptep_get_and_clear() pmdp_huge_get_and_clear() pudp_huge_get_and_clear() Link: https://lkml.kernel.org/r/20220507110114.4128854-4-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 39fa93e94b80..e0a449b477c5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -12,6 +12,7 @@ #include #include #include +#include #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS @@ -259,14 +260,6 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -#ifndef __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_clear(mm, addr, ptep); -} -#endif - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, @@ -274,10 +267,19 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = *ptep; pte_clear(mm, address, ptep); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + ptep_get_and_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { @@ -347,7 +349,10 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t *pmdp) { pmd_t pmd = *pmdp; + pmd_clear(pmdp); + page_table_check_pmd_clear(mm, address, pmd); + return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ @@ -359,6 +364,8 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); + page_table_check_pud_clear(mm, address, pud); + return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ -- cgit v1.2.3 From 2e7dc2b632a324c670ff11dd6c84d711043c683f Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: remove __HAVE_ARCH_PTEP_CLEAR in pgtable.h Currently, there is no architecture definition __HAVE_ARCH_PTEP_CLEAR, Generic ptep_clear() is the only definition for all architecture, So drop the "#ifndef __HAVE_ARCH_PTEP_CLEAR". Link: https://lkml.kernel.org/r/20220507110114.4128854-5-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Suggested-by: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Pasha Tatashin Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e0a449b477c5..ecb4c762d760 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -272,13 +272,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PTEP_CLEAR static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_get_and_clear(mm, addr, ptep); } -#endif #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) -- cgit v1.2.3 From c8db8c2628afc7088a43de3f7cfbcc2ef1f182f7 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Thu, 12 May 2022 20:23:07 -0700 Subject: mm: functions may simplify the use of return values p4d_clear_huge may be optimized for void return type and function usage. vunmap_p4d_range function saves a few steps here. Link: https://lkml.kernel.org/r/20220507150630.90399-1-kunyu@nfschina.com Signed-off-by: Li kunyu Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index ecb4c762d760..3cdc16cfd867 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1448,16 +1448,13 @@ static inline int pmd_protnone(pmd_t pmd) #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); -int p4d_clear_huge(p4d_t *p4d); +void p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } -static inline int p4d_clear_huge(p4d_t *p4d) -{ - return 0; -} +static inline void p4d_clear_huge(p4d_t *p4d) { } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); @@ -1480,10 +1477,7 @@ static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } -static inline int p4d_clear_huge(p4d_t *p4d) -{ - return 0; -} +static inline void p4d_clear_huge(p4d_t *p4d) { } static inline int pud_clear_huge(pud_t *pud) { return 0; -- cgit v1.2.3 From fe573327ffb1deba802c91dd1d4ff41dafa97a0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 12 May 2022 20:23:08 -0700 Subject: tracing: incorrect gfp_t conversion Fixes the following sparse warnings: include/trace/events/*: sparse: cast to restricted gfp_t include/trace/events/*: sparse: restricted gfp_t degrades to integer gfp_t type is bitwise and requires __force attributes for any casts. Link: https://lkml.kernel.org/r/331d88fe-f4f7-657c-02a2-d977f15fbff6@openvz.org Signed-off-by: Vasily Averin Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2a08a3c4ba95..2d2ccae933c2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -367,7 +367,7 @@ static inline int gfp_migratetype(const gfp_t gfp_flags) return MIGRATE_UNMOVABLE; /* Group based on mobility */ - return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; + return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT -- cgit v1.2.3 From 50d63b5bbfd12262069ad062611cd5e69c5e9e05 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:41 -0300 Subject: vfio: Change vfio_external_user_iommu_id() to vfio_file_iommu_group() The only caller wants to get a pointer to the struct iommu_group associated with the VFIO group file. Instead of returning the group ID then searching sysfs for that string to get the struct iommu_group just directly return the iommu_group pointer already held by the vfio_group struct. It already has a safe lifetime due to the struct file kref, the vfio_group and thus the iommu_group cannot be destroyed while the group file is open. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be1e7db4a314..d8c34c8490cb 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -140,7 +140,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); -extern int vfio_external_user_iommu_id(struct vfio_group *group); +extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); -- cgit v1.2.3 From c38ff5b0c373fbbd6a249eb461ffd4ae0f9dbfa0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:42 -0300 Subject: vfio: Remove vfio_external_group_match_file() vfio_group_fops_open() ensures there is only ever one struct file open for any struct vfio_group at any time: /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { vfio_group_put(group); return -EBUSY; Therefor the struct file * can be used directly to search the list of VFIO groups that KVM keeps instead of using the vfio_external_group_match_file() callback to try to figure out if the passed in FD matches the list or not. Delete vfio_external_group_match_file(). Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index d8c34c8490cb..df0adecdf1ea 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -138,8 +138,6 @@ int vfio_mig_get_next_state(struct vfio_device *device, */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); -extern bool vfio_external_group_match_file(struct vfio_group *group, - struct file *filep); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); -- cgit v1.2.3 From a905ad043f32bbb0c35d4325036397f20f30c8a9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:43 -0300 Subject: vfio: Change vfio_external_check_extension() to vfio_file_enforced_coherent() Instead of a general extension check change the function into a limited test if the iommu_domain has enforced coherency, which is the only thing kvm needs to query. Make the new op self contained by properly refcounting the container before touching it. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index df0adecdf1ea..7cc374da4839 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -139,8 +139,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); -extern long vfio_external_check_extension(struct vfio_group *group, - unsigned long arg); +extern bool vfio_file_enforced_coherent(struct file *file); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) -- cgit v1.2.3 From ba70a89f3c2a8279809ea0fc7684857c91938b8a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:44 -0300 Subject: vfio: Change vfio_group_set_kvm() to vfio_file_set_kvm() Just change the argument from struct vfio_group to struct file *. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 7cc374da4839..b298a26c4f07 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,6 +15,8 @@ #include #include +struct kvm; + /* * VFIO devices can be placed in a set, this allows all devices to share this * structure and the VFIO core will provide a lock that is held around @@ -140,6 +142,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern bool vfio_file_enforced_coherent(struct file *file); +extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) @@ -170,8 +173,6 @@ extern int vfio_unregister_notifier(struct vfio_device *device, enum vfio_notify_type type, struct notifier_block *nb); -struct kvm; -extern void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); /* * Sub-module helpers -- cgit v1.2.3 From 6a985ae80befcf2c00e7c889336bfe9e9739e2ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:46 -0300 Subject: vfio/pci: Use the struct file as the handle not the vfio_group VFIO PCI does a security check as part of hot reset to prove that the user has permission to manipulate all the devices that will be impacted by the reset. Use a new API vfio_file_has_dev() to perform this security check against the struct file directly and remove the vfio_group from VFIO PCI. Since VFIO PCI was the last user of vfio_group_get_external_user() and vfio_group_put_external_user() remove it as well. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b298a26c4f07..45b287826ce6 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -138,11 +138,10 @@ int vfio_mig_get_next_state(struct vfio_device *device, /* * External user API */ -extern struct vfio_group *vfio_group_get_external_user(struct file *filep); -extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern bool vfio_file_enforced_coherent(struct file *file); extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm); +extern bool vfio_file_has_dev(struct file *file, struct vfio_device *device); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) -- cgit v1.2.3 From 1af0e4a0233fea7e8226cb977d379dc20f9bbe11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Thu, 17 Feb 2022 15:18:57 +0100 Subject: security: declare member holding string literal const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct security_hook_list member lsm is assigned in security_add_hooks() with string literals passed from the individual security modules. Declare the function parameter and the struct member const to signal their immutability. Reported by Clang [-Wwrite-strings]: security/selinux/hooks.c:7388:63: error: passing 'const char [8]' to parameter of type 'char *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), selinux); ^~~~~~~~~ ./include/linux/lsm_hooks.h:1629:11: note: passing argument to parameter 'lsm' here char *lsm); ^ Signed-off-by: Christian Göttsche Reviewed-by: Paul Moore Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 419b5febc3ca..47cdf3fbecef 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1595,7 +1595,7 @@ struct security_hook_list { struct hlist_node list; struct hlist_head *head; union security_list_options hook; - char *lsm; + const char *lsm; } __randomize_layout; /* @@ -1630,7 +1630,7 @@ extern struct security_hook_heads security_hook_heads; extern char *lsm_names; extern void security_add_hooks(struct security_hook_list *hooks, int count, - char *lsm); + const char *lsm); #define LSM_FLAG_LEGACY_MAJOR BIT(0) #define LSM_FLAG_EXCLUSIVE BIT(1) -- cgit v1.2.3 From 1366992e16bddd5e2d9a561687f367f9f802e2e4 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 10 Apr 2022 16:49:50 +0200 Subject: timekeeping: Add raw clock fallback for random_get_entropy() The addition of random_get_entropy_fallback() provides access to whichever time source has the highest frequency, which is useful for gathering entropy on platforms without available cycle counters. It's not necessarily as good as being able to quickly access a cycle counter that the CPU has, but it's still something, even when it falls back to being jiffies-based. In the event that a given arch does not define get_cycles(), falling back to the get_cycles() default implementation that returns 0 is really not the best we can do. Instead, at least calling random_get_entropy_fallback() would be preferable, because that always needs to return _something_, even falling back to jiffies eventually. It's not as though random_get_entropy_fallback() is super high precision or guaranteed to be entropic, but basically anything that's not zero all the time is better than returning zero all the time. Finally, since random_get_entropy_fallback() is used during extremely early boot when randomizing freelists in mm_init(), it can be called before timekeeping has been initialized. In that case there really is nothing we can do; jiffies hasn't even started ticking yet. So just give up and return 0. Suggested-by: Thomas Gleixner Signed-off-by: Jason A. Donenfeld Reviewed-by: Thomas Gleixner Cc: Arnd Bergmann Cc: Theodore Ts'o --- include/linux/timex.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 5745c90c8800..3871b06bd302 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -62,6 +62,8 @@ #include #include +unsigned long random_get_entropy_fallback(void); + #include #ifndef random_get_entropy @@ -74,8 +76,14 @@ * * By default we use get_cycles() for this purpose, but individual * architectures may override this in their asm/timex.h header file. + * If a given arch does not have get_cycles(), then we fallback to + * using random_get_entropy_fallback(). */ +#ifdef get_cycles #define random_get_entropy() ((unsigned long)get_cycles()) +#else +#define random_get_entropy() random_get_entropy_fallback() +#endif #endif /* -- cgit v1.2.3 From 16d1e00c7e8a4950e914223b3112144289a82913 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 9 May 2022 15:42:52 -0700 Subject: bpf: Add MEM_UNINIT as a bpf_type_flag Instead of having uninitialized versions of arguments as separate bpf_arg_types (eg ARG_PTR_TO_UNINIT_MEM as the uninitialized version of ARG_PTR_TO_MEM), we can instead use MEM_UNINIT as a bpf_type_flag modifier to denote that the argument is uninitialized. Doing so cleans up some of the logic in the verifier. We no longer need to do two checks against an argument type (eg "if (base_type(arg_type) == ARG_PTR_TO_MEM || base_type(arg_type) == ARG_PTR_TO_UNINIT_MEM)"), since uninitialized and initialized versions of the same argument type will now share the same base type. In the near future, MEM_UNINIT will be used by dynptr helper functions as well. Signed-off-by: Joanne Koong Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/r/20220509224257.3222614-2-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5061ccd8b2dc..c107392b0ba7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -390,7 +390,10 @@ enum bpf_type_flag { */ PTR_UNTRUSTED = BIT(6 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_UNTRUSTED, + MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_FLAG_MAX, + __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; /* Max number of base types. */ @@ -409,16 +412,11 @@ enum bpf_arg_type { ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ - ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - /* the following constraints used to prototype bpf_memcmp() and other - * functions that access data on eBPF program stack + /* Used to prototype bpf_memcmp() and other functions that access data + * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, - * helper function must fill all bytes or clear - * them in error case. - */ ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ @@ -450,6 +448,10 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, + /* pointer to memory does not need to be initialized, helper function must fill + * all bytes or clear them in error case. + */ + ARG_PTR_TO_UNINIT_MEM = MEM_UNINIT | ARG_PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. -- cgit v1.2.3 From e7392b4eca84e86718a7b73aea8fa5573b06ba76 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm/highmem: fix kernel-doc warnings in highmem*.h Patch series "Extend and reorganize Highmem's documentation", v4. This series has the purpose to extend and reorganize Highmem's documentation. This is a work in progress because some information should still be moved from highmem.rst to highmem.h and highmem-internal.h. Specifically I'm talking about moving the "how to" information to the relevant headers, as it as been suggested by Ira Weiny (Intel). Also, this is a work in progress because some kdocs in highmem.h and highmem-internal.h should be improved. This patch (of 4): `scripts/kernel-doc -v -none include/linux/highmem*` reports the following warnings: include/linux/highmem.h:160: warning: expecting prototype for kunmap_atomic(). Prototype was for nr_free_highpages() instead include/linux/highmem.h:204: warning: No description found for return value of 'alloc_zeroed_user_highpage_movable' include/linux/highmem-internal.h:256: warning: Function parameter or member '__addr' not described in 'kunmap_atomic' include/linux/highmem-internal.h:256: warning: Excess function parameter 'addr' description in 'kunmap_atomic' Fix these warnings by (1) moving the kernel-doc comments from highmem.h to highmem-internal.h (which is the file were the kunmap_atomic() macro is actually defined), (2) extending and merging it with the comment which was already in highmem-internal.h, and (3) using correct parameter names (4) correcting a few technical inaccuracies in comments, and (5) adding a deprecation notice in kunmap_atomic() for consistency with kmap_atomic(). Link: https://lkml.kernel.org/r/20220428212455.892-1-fmdefrancesco@gmail.com Link: https://lkml.kernel.org/r/20220428212455.892-2-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Ira Weiny Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Catalin Marinas Cc: Will Deacon Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Jonathan Corbet Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/highmem-internal.h | 18 +++++++++++++++--- include/linux/highmem.h | 22 ++++++++-------------- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a77be5630209..a694ca95c4ed 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -236,9 +236,21 @@ static inline unsigned long totalhigh_pages(void) { return 0UL; } #endif /* CONFIG_HIGHMEM */ -/* - * Prevent people trying to call kunmap_atomic() as if it were kunmap() - * kunmap_atomic() should get the return value of kmap_atomic, not the page. +/** + * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! + * @__addr: Virtual address to be unmapped + * + * Unmaps an address previously mapped by kmap_atomic() and re-enables + * pagefaults. Depending on PREEMP_RT configuration, re-enables also + * migration and preemption. Users should not count on these side effects. + * + * Mappings should be unmapped in the reverse order that they were mapped. + * See kmap_local_page() for details on nesting. + * + * @__addr can be any address within the mapped page, so there is no need + * to subtract any offset that has been added. In contrast to kunmap(), + * this function takes the address returned from kmap_atomic(), not the + * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 67720bfd6ade..c05ae395898a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -37,7 +37,7 @@ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() - * @addr: Virtual address to be unmapped + * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. @@ -138,24 +138,16 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * * Returns: The virtual address of the mapping * - * Effectively a wrapper around kmap_local_page() which disables pagefaults - * and preemption. + * In fact a wrapper around kmap_local_page() which also disables pagefaults + * and, depending on PREEMPT_RT configuration, also CPU migration and + * preemption. Therefore users should not count on the latter two side effects. + * + * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. */ static inline void *kmap_atomic(struct page *page); -/** - * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - * @addr: Virtual address to be unmapped - * - * Counterpart to kmap_atomic(). - * - * Effectively a wrapper around kunmap_local() which additionally undoes - * the side effects of kmap_atomic(), i.e. reenabling pagefaults and - * preemption. - */ - /* Highmem related interfaces for management code */ static inline unsigned int nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); @@ -191,6 +183,8 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * + * Returns: The allocated and zeroed HIGHMEM page + * * This function will allocate a page for a VMA that the caller knows will * be able to migrate in the future using move_pages() or reclaimed * -- cgit v1.2.3 From 85a85e7601263f2b4edc86136dd0172c2cfbfaa1 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 13 May 2022 16:48:55 -0700 Subject: Documentation/vm: move "Using kmap-atomic" to highmem.h The use of kmap_atomic() is new code is being deprecated in favor of kmap_local_page(). For this reason the "Using kmap_atomic" section in highmem.rst is obsolete and unnecessary, but it can still help developers if it were moved to kdocs in highmem.h. Therefore, move the relevant parts of this section from highmem.rst and merge them with the kdocs in highmem.h. Link: https://lkml.kernel.org/r/20220428212455.892-4-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Suggested-by: Ira Weiny Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Ira Weiny Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Catalin Marinas Cc: Mike Rapoport Cc: Peter Collingbourne Cc: Vlastimil Babka Cc: Will Deacon Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/highmem.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c05ae395898a..3af34de54330 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -145,6 +145,37 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. + * + * It is used in atomic context when code wants to access the contents of a + * page that might be allocated from high memory (see __GFP_HIGHMEM), for + * example a page in the pagecache. The API has two functions, and they + * can be used in a manner similar to the following: + * + * -- Find the page of interest. -- + * struct page *page = find_get_page(mapping, offset); + * + * -- Gain access to the contents of that page. -- + * void *vaddr = kmap_atomic(page); + * + * -- Do something to the contents of that page. -- + * memset(vaddr, 0, PAGE_SIZE); + * + * -- Unmap that page. -- + * kunmap_atomic(vaddr); + * + * Note that the kunmap_atomic() call takes the result of the kmap_atomic() + * call, not the argument. + * + * If you need to map two pages because you want to copy from one page to + * another you need to keep the kmap_atomic calls strictly nested, like: + * + * vaddr1 = kmap_atomic(page1); + * vaddr2 = kmap_atomic(page2); + * + * memcpy(vaddr1, vaddr2, PAGE_SIZE); + * + * kunmap_atomic(vaddr2); + * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); -- cgit v1.2.3 From 5d4af6195c87c6b162b7963e0ad00a214b80d764 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 13 May 2022 16:48:55 -0700 Subject: mm: rmap: fix CONT-PTE/PMD size hugetlb issue when migration On some architectures (like ARM64), it can support CONT-PTE/PMD size hugetlb, which means it can support not only PMD/PUD size hugetlb: 2M and 1G, but also CONT-PTE/PMD size: 64K and 32M if a 4K page size specified. When migrating a hugetlb page, we will get the relevant page table entry by huge_pte_offset() only once to nuke it and remap it with a migration pte entry. This is correct for PMD or PUD size hugetlb, since they always contain only one pmd entry or pud entry in the page table. However this is incorrect for CONT-PTE and CONT-PMD size hugetlb, since they can contain several continuous pte or pmd entry with same page table attributes. So we will nuke or remap only one pte or pmd entry for this CONT-PTE/PMD size hugetlb page, which is not expected for hugetlb migration. The problem is we can still continue to modify the subpages' data of a hugetlb page during migrating a hugetlb page, which can cause a serious data consistent issue, since we did not nuke the page table entry and set a migration pte for the subpages of a hugetlb page. To fix this issue, we should change to use huge_ptep_clear_flush() to nuke a hugetlb page table, and remap it with set_huge_pte_at() and set_huge_swap_pte_at() when migrating a hugetlb page, which already considered the CONT-PTE or CONT-PMD size hugetlb. [akpm@linux-foundation.org: fix nommu build] [baolin.wang@linux.alibaba.com: fix build errors for !CONFIG_MMU] Link: https://lkml.kernel.org/r/a4baca670aca637e7198d9ae4543b8873cb224dc.1652270205.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/ea5abf529f0997b5430961012bfda6166c1efc8c.1652147571.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: James Bottomley Cc: Michael Ellerman Cc: Paul Mackerras Cc: Rich Felker Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 04f0186b089b..0f2894d01333 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1093,6 +1093,17 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } + +static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return *ptep; +} + +static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 78f39084b41d287aedb2ea55f2c1895cfa11d61a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 May 2022 16:48:56 -0700 Subject: mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl We must add hugetlb_free_vmemmap=on (or "off") to the boot cmdline and reboot the server to enable or disable the feature of optimizing vmemmap pages associated with HugeTLB pages. However, rebooting usually takes a long time. So add a sysctl to enable or disable the feature at runtime without rebooting. Why we need this? There are 3 use cases. 1) The feature of minimizing overhead of struct page associated with each HugeTLB is disabled by default without passing "hugetlb_free_vmemmap=on" to the boot cmdline. When we (ByteDance) deliver the servers to the users who want to enable this feature, they have to configure the grub (change boot cmdline) and reboot the servers, whereas rebooting usually takes a long time (we have thousands of servers). It's a very bad experience for the users. So we need a approach to enable this feature after rebooting. This is a use case in our practical environment. 2) Some use cases are that HugeTLB pages are allocated 'on the fly' instead of being pulled from the HugeTLB pool, those workloads would be affected with this feature enabled. Those workloads could be identified by the characteristics of they never explicitly allocating huge pages with 'nr_hugepages' but only set 'nr_overcommit_hugepages' and then let the pages be allocated from the buddy allocator at fault time. We can confirm it is a real use case from the commit 099730d67417. For those workloads, the page fault time could be ~2x slower than before. We suspect those users want to disable this feature if the system has enabled this before and they don't think the memory savings benefit is enough to make up for the performance drop. 3) If the workload which wants vmemmap pages to be optimized and the workload which wants to set 'nr_overcommit_hugepages' and does not want the extera overhead at fault time when the overcommitted pages be allocated from the buddy allocator are deployed in the same server. The user could enable this feature and set 'nr_hugepages' and 'nr_overcommit_hugepages', then disable the feature. In this case, the overcommited HugeTLB pages will not encounter the extra overhead at fault time. Link: https://lkml.kernel.org/r/20220512041142.39501-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Jonathan Corbet Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Oscar Salvador Cc: David Hildenbrand Cc: Masahiro Yamada Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..20d7edf62a6a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,4 +351,13 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +bool mhp_memmap_on_memory(void); +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} +#endif + #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From d4a157f5a26fbf1f1fd5da47a4772a738306348f Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Fri, 13 May 2022 16:48:57 -0700 Subject: mm/damon: add documentation for Enum value Fix the warning - "Enum value 'NR_DAMON_OPS' not described in enum 'damon_ops_id'" generated by the command "make pdfdocs" Link: https://lkml.kernel.org/r/20220508073316.141401-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d1e6ee28a2ff..7c62da31ce4b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -264,6 +264,7 @@ struct damos { * @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual * address spaces * @DAMON_OPS_PADDR: Monitoring operations for the physical address space + * @NR_DAMON_OPS: Number of monitoring operations implementations */ enum damon_ops_id { DAMON_OPS_VADDR, -- cgit v1.2.3 From 81132a39c152ca09832b9e4cb748129cee5f55ec Mon Sep 17 00:00:00 2001 From: Gou Hao Date: Tue, 2 Nov 2021 10:46:48 +0800 Subject: fs: remove fget_many and fput_many interface These two interface were added in 091141a42 commit, but now there is no place to call them. The only user of fput/fget_many() was removed in commit 62906e89e63b ("io_uring: remove file batch-get optimisation"). A user of get_file_rcu_many() were removed in commit f073531070d2 ("init: add an init_dup helper"). And replace atomic_long_sub/add to atomic_long_dec/inc can improve performance. Here are the test results of unixbench: Cmd: ./Run -c 64 context1 Without patch: System Benchmarks Partial Index BASELINE RESULT INDEX Pipe-based Context Switching 4000.0 2798407.0 6996.0 ======== System Benchmarks Index Score (Partial Only) 6996.0 With patch: System Benchmarks Partial Index BASELINE RESULT INDEX Pipe-based Context Switching 4000.0 3486268.8 8715.7 ======== System Benchmarks Index Score (Partial Only) 8715.7 Signed-off-by: Gou Hao Signed-off-by: Al Viro --- include/linux/file.h | 2 -- include/linux/fs.h | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 51e830b4fe3a..39704eae83e2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -14,7 +14,6 @@ struct file; extern void fput(struct file *); -extern void fput_many(struct file *, unsigned int); struct file_operations; struct task_struct; @@ -47,7 +46,6 @@ static inline void fdput(struct fd fd) } extern struct file *fget(unsigned int fd); -extern struct file *fget_many(unsigned int fd, unsigned int refs); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..3660c338bb16 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -981,9 +981,7 @@ static inline struct file *get_file(struct file *f) atomic_long_inc(&f->f_count); return f; } -#define get_file_rcu_many(x, cnt) \ - atomic_long_add_unless(&(x)->f_count, (cnt), 0) -#define get_file_rcu(x) get_file_rcu_many((x), 1) +#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count) #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) -- cgit v1.2.3 From 6319194ec57b0452dcda4589d24c4e7db299c5bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 May 2022 17:08:03 -0400 Subject: Unify the primitives for file descriptor closing Currently we have 3 primitives for removing an opened file from descriptor table - pick_file(), __close_fd_get_file() and close_fd_get_file(). Their calling conventions are rather odd and there's a code duplication for no good reason. They can be unified - 1) have __range_close() cap max_fd in the very beginning; that way we don't need separate way for pick_file() to report being past the end of descriptor table. 2) make {__,}close_fd_get_file() return file (or NULL) directly, rather than returning it via struct file ** argument. Don't bother with (bogus) return value - nobody wants that -ENOENT. 3) make pick_file() return NULL on unopened descriptor - the only caller that used to care about the distinction between descriptor past the end of descriptor table and finding NULL in descriptor table doesn't give a damn after (1). 4) lift ->files_lock out of pick_file() That actually simplifies the callers, as well as the primitives themselves. Code duplication is also gone... Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Al Viro --- include/linux/fdtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index d0e78174874a..e066816f3519 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -125,7 +125,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern int close_fd_get_file(unsigned int fd, struct file **res); +extern struct file *close_fd_get_file(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); -- cgit v1.2.3 From 03fea699b050805ad6ee111f9db04f223f3e835e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 15 May 2022 21:58:30 +0100 Subject: cdrom: remove the unused driver specific disc change ioctl This was only used by the ide-cd driver, which went away in commit b7fb14d3ac63 ("ide: remove the legacy ide driver") so we might as well take advantage of that and get rid of this hook as well. Cc: Christoph Hellwig Cc: Jens Axboe Cc: Phillip Potter Signed-off-by: Paul Gortmaker Link: https://lore.kernel.org/all/20220427132436.12795-2-paul.gortmaker@windriver.com Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20220515205833.944139-3-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 0a89f111e00e..67caa909e3e6 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -77,7 +77,6 @@ struct cdrom_device_ops { int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); int (*select_speed) (struct cdrom_device_info *, int); - int (*select_disc) (struct cdrom_device_info *, int); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn) (struct cdrom_device_info *, -- cgit v1.2.3 From ca2d89925ae3f3d5c65182ff75e58bc9b484e69c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 Apr 2022 12:19:35 +0300 Subject: nvme: add missing status values to verbose logging Log a few more path related status codes. Signed-off-by: Max Gurtovoy Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f626a445d1a8..bbabdc7600da 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1679,9 +1679,11 @@ enum { /* * Path-related Errors: */ + NVME_SC_INTERNAL_PATH_ERROR = 0x300, NVME_SC_ANA_PERSISTENT_LOSS = 0x301, NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_CTRL_PATH_ERROR = 0x360, NVME_SC_HOST_PATH_ERROR = 0x370, NVME_SC_HOST_ABORTED_CMD = 0x371, -- cgit v1.2.3 From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX -- cgit v1.2.3 From 34b92e8d19da1e44070dc0ecc2747871469ac534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:58 -0700 Subject: net: limit GSO_MAX_SIZE to 524280 bytes Make sure we will not overflow shinfo->gso_segs Minimal TCP MSS size is 8 bytes, and shinfo->gso_segs is a 16bit field. TCP_MIN_GSO_SIZE is currently defined in include/net/tcp.h, it seems cleaner to not bring tcp details into include/linux/netdevice.h Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce780e352f43..fd38847d0dc7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,14 +2272,17 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ +#define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -#define GSO_MAX_SIZE UINT_MAX +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -#define GSO_MAX_SEGS 65535 u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; -- cgit v1.2.3 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; -- cgit v1.2.3 From 80e425b613421911f89664663a7060216abcaed2 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Fri, 13 May 2022 11:34:04 -0700 Subject: ipv6: Add hop-by-hop header to jumbograms in ip6_output Instead of simply forcing a 0 payload_len in IPv6 header, implement RFC 2675 and insert a custom extension header. Note that only TCP stack is currently potentially generating jumbograms, and that this extension header is purely local, it wont be sent on a physical link. This is needed so that packet capture (tcpdump and friends) can properly dissect these large packets. Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ec5ca392eaa3..38c8203d52cb 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -145,6 +145,7 @@ struct inet6_skb_parm { #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 +#define IP6SKB_FAKEJUMBO 512 }; #if defined(CONFIG_NET_L3_MASTER_DEV) -- cgit v1.2.3 From 7ebd3f3ee51a9e02994cd0a1be44fbd325d1e0dc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 13 May 2022 11:03:38 +0800 Subject: net: skb: change the definition SKB_DR_SET() The SKB_DR_OR() is used to set the drop reason to a value when it is not set or specified yet. SKB_NOT_DROPPED_YET should also be considered as not set. Reviewed-by: Jiang Biao Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9d82a8b6c8f1..5a2b29df6cab 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -510,7 +510,8 @@ enum skb_drop_reason { (name = SKB_DROP_REASON_##reason) #define SKB_DR_OR(name, reason) \ do { \ - if (name == SKB_DROP_REASON_NOT_SPECIFIED) \ + if (name == SKB_DROP_REASON_NOT_SPECIFIED || \ + name == SKB_NOT_DROPPED_YET) \ SKB_DR_SET(name, reason); \ } while (0) -- cgit v1.2.3 From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 May 2022 21:24:53 -0700 Subject: net: fix possible race in skb_attempt_defer_free() A cpu can observe sd->defer_count reaching 128, and call smp_call_function_single_async() Problem is that the remote CPU can clear sd->defer_count before the IPI is run/acknowledged. Other cpus can queue more packets and also decide to call smp_call_function_single_async() while the pending IPI was not yet delivered. This is a common issue with smp_call_function_single_async(). Callers must ensure correct synchronization and serialization. I triggered this issue while experimenting smaller threshold. Performing the call to smp_call_function_single_async() under sd->defer_lock protection did not solve the problem. Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async() to insert locked csd") replaced an informative WARN_ON_ONCE() with a return of -EBUSY, which is often ignored. Test of CSD_FLAG_LOCK presence is racy anyway. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d57ce248004c..cbaf312e365b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; + int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; }; -- cgit v1.2.3 From cf2df74e202d81b09f09d84c2d8903e0e87e9274 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 9 May 2022 14:26:15 +0200 Subject: net: fix dev_fill_forward_path with pppoe + bridge When calling dev_fill_forward_path on a pppoe device, the provided destination address is invalid. In order for the bridge fdb lookup to succeed, the pppoe code needs to update ctx->daddr to the correct value. Fix this by storing the address inside struct net_device_path_ctx Fixes: f6efc675c9dd ("net: ppp: resolve forwarding path for bridge pppoe devices") Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1fbe21650bb..f736c020cde2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -900,7 +900,7 @@ struct net_device_path_stack { struct net_device_path_ctx { const struct net_device *dev; - const u8 *daddr; + u8 daddr[ETH_ALEN]; int num_vlans; struct { -- cgit v1.2.3 From 9db69df4bdd37eb1f65b6931ee067fb15b9a4d5c Mon Sep 17 00:00:00 2001 From: TingHan Shen Date: Thu, 12 May 2022 16:22:13 +0800 Subject: firmware: mediatek: Add adsp ipc protocol interface Some of mediatek processors contain the Tensilica HiFix DSP for audio processing. The communication between Host CPU and DSP firmware is taking place using a shared memory area for message passing. ADSP IPC protocol offers (send/recv) interfaces using mediatek-mailbox APIs. We use two mbox channels to implement a request-reply protocol. Signed-off-by: Allen-KH Cheng Signed-off-by: TingHan Shen Reviewed-by: Pierre-Louis Bossart Reviewed-by: Curtis Malainey Reviewed-by: Tzung-Bi Shih Reviewed-by: YC Hung Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220512082215.3018-2-tinghan.shen@mediatek.com Signed-off-by: Mark Brown --- include/linux/firmware/mediatek/mtk-adsp-ipc.h | 65 ++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 include/linux/firmware/mediatek/mtk-adsp-ipc.h (limited to 'include/linux') diff --git a/include/linux/firmware/mediatek/mtk-adsp-ipc.h b/include/linux/firmware/mediatek/mtk-adsp-ipc.h new file mode 100644 index 000000000000..28fd313340b8 --- /dev/null +++ b/include/linux/firmware/mediatek/mtk-adsp-ipc.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 MediaTek Inc. + */ + +#ifndef MTK_ADSP_IPC_H +#define MTK_ADSP_IPC_H + +#include +#include +#include +#include + +#define MTK_ADSP_IPC_REQ 0 +#define MTK_ADSP_IPC_RSP 1 +#define MTK_ADSP_IPC_OP_REQ 0x1 +#define MTK_ADSP_IPC_OP_RSP 0x2 + +enum { + MTK_ADSP_MBOX_REPLY, + MTK_ADSP_MBOX_REQUEST, + MTK_ADSP_MBOX_NUM, +}; + +struct mtk_adsp_ipc; + +struct mtk_adsp_ipc_ops { + void (*handle_reply)(struct mtk_adsp_ipc *ipc); + void (*handle_request)(struct mtk_adsp_ipc *ipc); +}; + +struct mtk_adsp_chan { + struct mtk_adsp_ipc *ipc; + struct mbox_client cl; + struct mbox_chan *ch; + char *name; + int idx; +}; + +struct mtk_adsp_ipc { + struct mtk_adsp_chan chans[MTK_ADSP_MBOX_NUM]; + struct device *dev; + struct mtk_adsp_ipc_ops *ops; + void *private_data; +}; + +static inline void mtk_adsp_ipc_set_data(struct mtk_adsp_ipc *ipc, void *data) +{ + if (!ipc) + return; + + ipc->private_data = data; +} + +static inline void *mtk_adsp_ipc_get_data(struct mtk_adsp_ipc *ipc) +{ + if (!ipc) + return NULL; + + return ipc->private_data; +} + +int mtk_adsp_ipc_send(struct mtk_adsp_ipc *ipc, unsigned int idx, uint32_t op); + +#endif /* MTK_ADSP_IPC_H */ -- cgit v1.2.3 From 7f8d12ea96352275c2850c24a1367166179392d2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 29 Mar 2022 15:55:59 +0900 Subject: fs: add a lockdep check function for sb_start_write() Add a function sb_write_started() to allow callers to verify if sb_start_write() is properly called. It will be used for assertion in btrfs. Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..01d61984ce7a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1708,6 +1708,11 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +static inline bool sb_write_started(const struct super_block *sb) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit v1.2.3 From 908c54909ae72dcbf1d7e1440f7297187d06c275 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 May 2022 15:11:10 -0500 Subject: iomap: allow the file system to provide a bio_set for direct I/O Allow the file system to provide a specific bio_set for allocating direct I/O bios. This will allow file systems that use the ->submit_io hook to stash away additional information for file system use. To make use of this additional space for information in the completion path, the file system needs to override the ->bi_end_io callback and then call back into iomap, so export iomap_dio_bio_end_io for that. Reviewed-by: Darrick J. Wong Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/iomap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b76f0dd149fb..cf903f1a230f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -320,6 +320,16 @@ struct iomap_dio_ops { unsigned flags); void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset); + + /* + * Filesystems wishing to attach private information to a direct io bio + * must provide a ->submit_io method that attaches the additional + * information to the bio and changes the ->bi_end_io callback to a + * custom function. This function should, at a minimum, perform any + * relevant post-processing of the bio and end with a call to + * iomap_dio_bio_end_io. + */ + struct bio_set *bio_set; }; /* @@ -349,6 +359,7 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); +void iomap_dio_bio_end_io(struct bio *bio); #ifdef CONFIG_SWAP struct file; -- cgit v1.2.3 From 786f847f43a54e63161474fe85a4f1764d871a35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 May 2022 15:11:11 -0500 Subject: iomap: add per-iomap_iter private data Allow the file system to keep state for all iterations. For now only wire it up for direct I/O as there is an immediate need for it there. Reviewed-by: Darrick J. Wong Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/iomap.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index cf903f1a230f..5b6f64f4d771 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,6 +188,7 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); @@ -354,10 +355,10 @@ struct iomap_dio_ops { ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags, size_t done_before); + unsigned int dio_flags, void *private, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags, size_t done_before); + unsigned int dio_flags, void *private, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); void iomap_dio_bio_end_io(struct bio *bio); -- cgit v1.2.3 From b3fdf9398a16f01dc013967a4ab25e99c3f4fc12 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:21:46 -0700 Subject: x86/mce: relocate set{clear}_mce_nospec() functions Relocate the twin mce functions to arch/x86/mm/pat/set_memory.c file where they belong. While at it, fixup a function name in a comment. Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Borislav Petkov Cc: Stephen Rothwell [sfr: gate {set,clear}_mce_nospec() by CONFIG_X86_64] Link: https://lore.kernel.org/r/165272527328.90175.8336008202048685278.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/set_memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index f36be5166c19..683a6c3f7179 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -42,14 +42,14 @@ static inline bool can_set_direct_map(void) #endif #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -#ifndef set_mce_nospec +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn, bool unmap); +int clear_mce_nospec(unsigned long pfn); +#else static inline int set_mce_nospec(unsigned long pfn, bool unmap) { return 0; } -#endif - -#ifndef clear_mce_nospec static inline int clear_mce_nospec(unsigned long pfn) { return 0; -- cgit v1.2.3 From 5898b43af954b83c4a4ee4ab85c4dbafa395822a Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:38:10 -0700 Subject: mce: fix set_mce_nospec to always unmap the whole page The set_memory_uc() approach doesn't work well in all cases. As Dan pointed out when "The VMM unmapped the bad page from guest physical space and passed the machine check to the guest." "The guest gets virtual #MC on an access to that page. When the guest tries to do set_memory_uc() and instructs cpa_flush() to do clean caches that results in taking another fault / exception perhaps because the VMM unmapped the page from the guest." Since the driver has special knowledge to handle NP or UC, mark the poisoned page with NP and let driver handle it when it comes down to repair. Please refer to discussions here for more details. https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/ Now since poisoned page is marked as not-present, in order to avoid writing to a not-present page and trigger kernel Oops, also fix pmem_do_write(). Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Tony Luck Link: https://lore.kernel.org/r/165272615484.103830.2563950688772226611.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/set_memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 683a6c3f7179..369769ce7399 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void) #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifdef CONFIG_X86_64 -int set_mce_nospec(unsigned long pfn, bool unmap); +int set_mce_nospec(unsigned long pfn); int clear_mce_nospec(unsigned long pfn); #else -static inline int set_mce_nospec(unsigned long pfn, bool unmap) +static inline int set_mce_nospec(unsigned long pfn) { return 0; } -- cgit v1.2.3 From e511c4a3d2a1f64aafc1f5df37a2ffcf7ef91b55 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 13 May 2022 15:10:58 -0700 Subject: dax: introduce DAX_RECOVERY_WRITE dax access mode Up till now, dax_direct_access() is used implicitly for normal access, but for the purpose of recovery write, dax range with poison is requested. To make the interface clear, introduce enum dax_access_mode { DAX_ACCESS, DAX_RECOVERY_WRITE, } where DAX_ACCESS is used for normal dax access, and DAX_RECOVERY_WRITE is used for dax recovery write. Suggested-by: Dan Williams Signed-off-by: Jane Chu Reviewed-by: Christoph Hellwig Cc: Mike Snitzer Reviewed-by: Vivek Goyal Link: https://lore.kernel.org/r/165247982851.52965.11024212198889762949.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/dax.h | 9 +++++++-- include/linux/device-mapper.h | 4 +++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9fc5f99a0ae2..3f1339bce3c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,11 @@ struct iomap_ops; struct iomap_iter; struct iomap; +enum dax_access_mode { + DAX_ACCESS, + DAX_RECOVERY_WRITE, +}; + struct dax_operations { /* * direct_access: translate a device-relative @@ -21,7 +26,7 @@ struct dax_operations { * number of pages available for DAX at that pfn. */ long (*direct_access)(struct dax_device *, pgoff_t, long, - void **, pfn_t *); + enum dax_access_mode, void **, pfn_t *); /* * Validate whether this device is usable as an fsdax backing * device. @@ -178,7 +183,7 @@ static inline void dax_read_unlock(int id) bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, - void **kaddr, pfn_t *pfn); + enum dax_access_mode mode, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c2a3758c4aaa..acdedda0d12b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -20,6 +20,7 @@ struct dm_table; struct dm_report_zones_args; struct mapped_device; struct bio_vec; +enum dax_access_mode; /* * Type of table, mapped_device's mempool and request_queue @@ -146,7 +147,8 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); * >= 0 : the number of bytes accessible at the address */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn); + long nr_pages, enum dax_access_mode node, void **kaddr, + pfn_t *pfn); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); -- cgit v1.2.3 From 047218ec904da19c45c4a70274fc3f818a1fcba1 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 22 Apr 2022 16:45:06 -0600 Subject: dax: add .recovery_write dax_operation Introduce dax_recovery_write() operation. The function is used to recover a dax range that contains poison. Typical use case is when a user process receives a SIGBUS with si_code BUS_MCEERR_AR indicating poison(s) in a dax range, in response, the user process issues a pwrite() to the page-aligned dax range, thus clears the poison and puts valid data in the range. Reviewed-by: Christoph Hellwig Signed-off-by: Jane Chu Link: https://lore.kernel.org/r/20220422224508.440670-6-jane.chu@oracle.com Signed-off-by: Dan Williams --- include/linux/dax.h | 13 +++++++++++++ include/linux/device-mapper.h | 9 +++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 3f1339bce3c0..e7b81634c52a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -35,6 +35,12 @@ struct dax_operations { sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); + /* + * recovery_write: recover a poisoned range by DAX device driver + * capable of clearing poison. + */ + size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *iter); }; #if IS_ENABLED(CONFIG_DAX) @@ -45,6 +51,8 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); +size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); /* * Check if given mapping is supported by the file / underlying device. */ @@ -92,6 +100,11 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, { return !(vma->vm_flags & VM_SYNC); } +static inline size_t dax_recovery_write(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return 0; +} #endif void set_dax_nocache(struct dax_device *dax_dev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index acdedda0d12b..47a01c7cffdf 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -152,6 +152,14 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); +/* + * Returns: + * != 0 : number of bytes transferred + * 0 : recovery write failed + */ +typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); + void dm_error(const char *message); struct dm_dev { @@ -201,6 +209,7 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_zero_page_range_fn dax_zero_page_range; + dm_dax_recovery_write_fn dax_recovery_write; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 89af2ce2d95c80f6da9388d9da2e35c84c2573b1 Mon Sep 17 00:00:00 2001 From: Ricardo Martinez Date: Fri, 13 May 2022 10:34:00 -0700 Subject: net: skb: Remove skb_data_area_size() skb_data_area_size() is not needed. As Jakub pointed out [1]: For Rx, drivers can use the size passed during skb allocation or use skb_tailroom(). For Tx, drivers should use skb_headlen(). [1] https://lore.kernel.org/netdev/CAHNKnsTmH-rGgWi3jtyC=ktM1DW2W1VJkYoTMJV2Z_Bt498bsg@mail.gmail.com/ Signed-off-by: Ricardo Martinez Reviewed-by: Andy Shevchenko Reviewed-by: Sergey Ryazanov Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5a2b29df6cab..da96f0d3e753 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1765,11 +1765,6 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif -static inline unsigned int skb_data_area_size(struct sk_buff *skb) -{ - return skb_end_pointer(skb) - skb->data; -} - struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From e626f37e657adbab2a7abe51480925891662a5f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 May 2022 14:29:43 +0200 Subject: nvme: split the enum used for various register constants Instead of having one big enum add one for each register or field. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- include/linux/nvme.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bbabdc7600da..5f6d432fa06a 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -204,8 +204,9 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, - NVME_CAP_CSS_NVM = 1 << 0, - NVME_CAP_CSS_CSI = 1 << 6, +}; + +enum { NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -214,10 +215,18 @@ enum { NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, NVME_CSTS_SHST_MASK = 3 << 2, +}; + +enum { NVME_CMBMSC_CRE = 1 << 0, NVME_CMBMSC_CMSE = 1 << 1, }; +enum { + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, +}; + struct nvme_id_power_state { __le16 max_power; /* centiwatts */ __u8 rsvd2; -- cgit v1.2.3 From a03dacb0316f74400846aaf144d6c73f4217ca08 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Tue, 2 Mar 2021 15:58:21 +0900 Subject: PM / devfreq: Add cpu based scaling support to passive governor Many CPU architectures have caches that can scale independent of the CPUs. Frequency scaling of the caches is necessary to make sure that the cache is not a performance bottleneck that leads to poor performance and power. The same idea applies for RAM/DDR. To achieve this, this patch adds support for cpu based scaling to the passive governor. This is accomplished by taking the current frequency of each CPU frequency domain and then adjust the frequency of the cache (or any devfreq device) based on the frequency of the CPUs. It listens to CPU frequency transition notifiers to keep itself up to date on the current CPU frequency. To decide the frequency of the device, the governor does one of the following: * Derives the optimal devfreq device opp from required-opps property of the parent cpu opp_table. * Scales the device frequency in proportion to the CPU frequency. So, if the CPUs are running at their max frequency, the device runs at its max frequency. If the CPUs are running at their min frequency, the device runs at its min frequency. It is interpolated for frequencies in between. Tested-by: Chen-Yu Tsai Tested-by: Johnson Wang Signed-off-by: Saravana Kannan [Sibi: Integrated cpu-freqmap governor into passive_governor] Signed-off-by: Sibi Sankar [Chanwoo: Fix conflict with latest code and cleanup code] Signed-off-by: Chanwoo Choi --- include/linux/devfreq.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 142474b4af96..b1e4a6f796ce 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -38,6 +38,7 @@ enum devfreq_timer { struct devfreq; struct devfreq_governor; +struct devfreq_cpu_data; struct thermal_cooling_device; /** @@ -288,6 +289,11 @@ struct devfreq_simple_ondemand_data { #endif #if IS_ENABLED(CONFIG_DEVFREQ_GOV_PASSIVE) +enum devfreq_parent_dev_type { + DEVFREQ_PARENT_DEV, + CPUFREQ_PARENT_DEV, +}; + /** * struct devfreq_passive_data - ``void *data`` fed to struct devfreq * and devfreq_add_device @@ -299,8 +305,11 @@ struct devfreq_simple_ondemand_data { * using governors except for passive governor. * If the devfreq device has the specific method to decide * the next frequency, should use this callback. - * @this: the devfreq instance of own device. - * @nb: the notifier block for DEVFREQ_TRANSITION_NOTIFIER list + * @parent_type: the parent type of the device. + * @this: the devfreq instance of own device. + * @nb: the notifier block for DEVFREQ_TRANSITION_NOTIFIER or + * CPUFREQ_TRANSITION_NOTIFIER list. + * @parent_cpu_data: the state min/max/current frequency of all online cpu's. * * The devfreq_passive_data have to set the devfreq instance of parent * device with governors except for the passive governor. But, don't need to @@ -314,9 +323,13 @@ struct devfreq_passive_data { /* Optional callback to decide the next frequency of passvice device */ int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + /* Should set the type of parent device */ + enum devfreq_parent_dev_type parent_type; + /* For passive governor's internal use. Don't need to set them */ struct devfreq *this; struct notifier_block nb; + struct devfreq_cpu_data *parent_cpu_data[NR_CPUS]; }; #endif -- cgit v1.2.3 From 26984d9d581e5049bd75091d2e789b9cc3ea12e0 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 27 Apr 2022 03:49:19 +0900 Subject: PM / devfreq: passive: Keep cpufreq_policy for possible cpus The passive governor requires the cpu data to get the next target frequency of devfreq device if depending on cpu. In order to reduce the unnecessary memory data, keep cpufreq_policy data for possible cpus instead of NR_CPU. Tested-by: Chen-Yu Tsai Tested-by: Johnson Wang Signed-off-by: Chanwoo Choi --- include/linux/devfreq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index b1e4a6f796ce..dc10bee75a72 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -309,7 +309,7 @@ enum devfreq_parent_dev_type { * @this: the devfreq instance of own device. * @nb: the notifier block for DEVFREQ_TRANSITION_NOTIFIER or * CPUFREQ_TRANSITION_NOTIFIER list. - * @parent_cpu_data: the state min/max/current frequency of all online cpu's. + * @cpu_data_list: the list of cpu frequency data for all cpufreq_policy. * * The devfreq_passive_data have to set the devfreq instance of parent * device with governors except for the passive governor. But, don't need to @@ -329,7 +329,7 @@ struct devfreq_passive_data { /* For passive governor's internal use. Don't need to set them */ struct devfreq *this; struct notifier_block nb; - struct devfreq_cpu_data *parent_cpu_data[NR_CPUS]; + struct list_head cpu_data_list; }; #endif -- cgit v1.2.3 From 1ad6c3b7ef132e1d8c5d606008069724625c8daf Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 5 Apr 2022 11:24:51 +0200 Subject: hwmon: introduce hwmon_sanitize_name() More and more drivers will check for bad characters in the hwmon name and all are using the same code snippet. Consolidate that code by adding a new hwmon_sanitize_name() function. Signed-off-by: Michael Walle Reviewed-by: Tom Rix Link: https://lore.kernel.org/r/20220405092452.4033674-2-michael@walle.cc Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index eba380b76d15..4efaf06fd2b8 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -461,6 +461,9 @@ void devm_hwmon_device_unregister(struct device *dev); int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel); +char *hwmon_sanitize_name(const char *name); +char *devm_hwmon_sanitize_name(struct device *dev, const char *name); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered -- cgit v1.2.3 From c8383054506c77b814489c09877b5db83fd4abf2 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 25 Apr 2022 20:21:24 +0800 Subject: cachefiles: notify the user daemon when looking up cookie Fscache/CacheFiles used to serve as a local cache for a remote networking fs. A new on-demand read mode will be introduced for CacheFiles, which can boost the scenario where on-demand read semantics are needed, e.g. container image distribution. The essential difference between these two modes is seen when a cache miss occurs: In the original mode, the netfs will fetch the data from the remote server and then write it to the cache file; in on-demand read mode, fetching the data and writing it into the cache is delegated to a user daemon. As the first step, notify the user daemon when looking up cookie. In this case, an anonymous fd is sent to the user daemon, through which the user daemon can write the fetched data to the cache file. Since the user daemon may move the anonymous fd around, e.g. through dup(), an object ID uniquely identifying the cache file is also attached. Also add one advisory flag (FSCACHE_ADV_WANT_CACHE_SIZE) suggesting that the cache file size shall be retrieved at runtime. This helps the scenario where one cache file contains multiple netfs files, e.g. for the purpose of deduplication. In this case, netfs itself has no idea the size of the cache file, whilst the user daemon should give the hint on it. Signed-off-by: Jeffle Xu Link: https://lore.kernel.org/r/20220509074028.74954-3-jefflexu@linux.alibaba.com Acked-by: David Howells Signed-off-by: Gao Xiang --- include/linux/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index e25539072463..72585c9729a2 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -39,6 +39,7 @@ struct fscache_cookie; #define FSCACHE_ADV_SINGLE_CHUNK 0x01 /* The object is a single chunk of data */ #define FSCACHE_ADV_WRITE_CACHE 0x00 /* Do cache if written to locally */ #define FSCACHE_ADV_WRITE_NOCACHE 0x02 /* Don't cache if written to locally */ +#define FSCACHE_ADV_WANT_CACHE_SIZE 0x04 /* Retrieve cache size at runtime */ #define FSCACHE_INVAL_DIO_WRITE 0x01 /* Invalidate due to DIO write */ -- cgit v1.2.3 From 9032b6e8589f269743984aac53e82e4835be16dc Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 25 Apr 2022 20:21:27 +0800 Subject: cachefiles: implement on-demand read Implement the data plane of on-demand read mode. The early implementation [1] place the entry to cachefiles_ondemand_read() in fscache_read(). However, fscache_read() can only detect if the requested file range is fully cache miss, whilst we need to notify the user daemon as long as there's a hole inside the requested file range. Thus the entry is now placed in cachefiles_prepare_read(). When working in on-demand read mode, once a hole detected, the read routine will send a READ request to the user daemon. The user daemon needs to fetch the data and write it to the cache file. After sending the READ request, the read routine will hang there, until the READ request is handled by the user daemon. Then it will retry to read from the same file range. If no progress encountered, the read routine will fail then. A new NETFS_SREQ_ONDEMAND flag is introduced to indicate that on-demand read should be done when a cache miss encountered. [1] https://lore.kernel.org/all/20220406075612.60298-6-jefflexu@linux.alibaba.com/ #v8 Signed-off-by: Jeffle Xu Acked-by: David Howells Link: https://lore.kernel.org/r/20220425122143.56815-6-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c7bf1eaf51d5..057d04efaf79 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -159,6 +159,7 @@ struct netfs_io_subrequest { #define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ }; enum netfs_io_origin { -- cgit v1.2.3 From 7b8b44eb7710e34a1ea5278392417993a549fabf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:36:58 -0400 Subject: NFSv4: Specify the type of ACL to cache When caching a NFSv4 ACL, we want to specify whether we are caching an NFSv4.0 type acl, the NFSv4.1 dacl or the NFSv4.1 sacl. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 2 ++ include/linux/nfs_xdr.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5662d8be04eb..8d04b6a5964c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -451,6 +451,8 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) +#define FATTR4_WORD1_DACL (1UL << 26) +#define FATTR4_WORD1_SACL (1UL << 27) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) #define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2863e5a69c6a..13d068c57d8d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -800,6 +800,13 @@ struct nfs_setattrargs { const struct nfs4_label *label; }; +enum nfs4_acl_type { + NFS4ACL_NONE = 0, + NFS4ACL_ACL, + NFS4ACL_DACL, + NFS4ACL_SACL, +}; + struct nfs_setaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; -- cgit v1.2.3 From db145db021abf75aa1a5810e631425055cfbed47 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:36:59 -0400 Subject: NFSv4: Add encoders/decoders for the NFSv4.1 dacl and sacl attributes Add the ability to set or retrieve the acl using the NFSv4.1 'dacl' and 'sacl' attributes to the NFSv4 xdr encoders/decoders. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 13d068c57d8d..4a8ba84f848e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -810,6 +810,7 @@ enum nfs4_acl_type { struct nfs_setaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -821,6 +822,7 @@ struct nfs_setaclres { struct nfs_getaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -829,6 +831,7 @@ struct nfs_getaclargs { #define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { struct nfs4_sequence_res seq_res; + enum nfs4_acl_type acl_type; size_t acl_len; size_t acl_data_offset; int acl_flags; -- cgit v1.2.3 From 69e9cd66ae1392437234a63a3a1d60b6655f92ef Mon Sep 17 00:00:00 2001 From: Julian Orth Date: Tue, 17 May 2022 12:32:53 +0200 Subject: audit,io_uring,io-wq: call __audit_uring_exit for dummy contexts Not calling the function for dummy contexts will cause the context to not be reset. During the next syscall, this will cause an error in __audit_syscall_entry: WARN_ON(context->context != AUDIT_CTX_UNUSED); WARN_ON(context->name_count); if (context->context != AUDIT_CTX_UNUSED || context->name_count) { audit_panic("unrecoverable error in audit_syscall_entry()"); return; } These problematic dummy contexts are created via the following call chain: exit_to_user_mode_prepare -> arch_do_signal_or_restart -> get_signal -> task_work_run -> tctx_task_work -> io_req_task_submit -> io_issue_sqe -> audit_uring_entry Cc: stable@vger.kernel.org Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring") Signed-off-by: Julian Orth [PM: subject line tweaks] Signed-off-by: Paul Moore --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index d06134ac6245..cece70231138 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -339,7 +339,7 @@ static inline void audit_uring_entry(u8 op) } static inline void audit_uring_exit(int success, long code) { - if (unlikely(!audit_dummy_context())) + if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, -- cgit v1.2.3 From 0aa7be05d83cc584da0782405e8007e351dfb6cc Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 15 May 2022 20:42:03 +0200 Subject: locking/atomic: Add generic try_cmpxchg64 support Add generic support for try_cmpxchg64{,_acquire,_release,_relaxed} and their falbacks involving cmpxchg64. Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220515184205.103089-2-ubizjak@gmail.com --- include/linux/atomic/atomic-arch-fallback.h | 72 ++++++++++++++++++++++++++++- include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 6db58d180866..77bc5522e61c 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -147,6 +147,76 @@ #endif /* arch_try_cmpxchg_relaxed */ +#ifndef arch_try_cmpxchg64_relaxed +#ifdef arch_try_cmpxchg64 +#define arch_try_cmpxchg64_acquire arch_try_cmpxchg64 +#define arch_try_cmpxchg64_release arch_try_cmpxchg64 +#define arch_try_cmpxchg64_relaxed arch_try_cmpxchg64 +#endif /* arch_try_cmpxchg64 */ + +#ifndef arch_try_cmpxchg64 +#define arch_try_cmpxchg64(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg64((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg64 */ + +#ifndef arch_try_cmpxchg64_acquire +#define arch_try_cmpxchg64_acquire(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg64_acquire((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg64_acquire */ + +#ifndef arch_try_cmpxchg64_release +#define arch_try_cmpxchg64_release(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg64_release((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg64_release */ + +#ifndef arch_try_cmpxchg64_relaxed +#define arch_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = arch_cmpxchg64_relaxed((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif /* arch_try_cmpxchg64_relaxed */ + +#else /* arch_try_cmpxchg64_relaxed */ + +#ifndef arch_try_cmpxchg64_acquire +#define arch_try_cmpxchg64_acquire(...) \ + __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg64_release +#define arch_try_cmpxchg64_release(...) \ + __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__) +#endif + +#ifndef arch_try_cmpxchg64 +#define arch_try_cmpxchg64(...) \ + __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__) +#endif + +#endif /* arch_try_cmpxchg64_relaxed */ + #ifndef arch_atomic_read_acquire static __always_inline int arch_atomic_read_acquire(const atomic_t *v) @@ -2386,4 +2456,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 8e2cc06bc0d2c0967d2f8424762bd48555ee40ae +// b5e87bdd5ede61470c29f7a7e4de781af3770f09 diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 5d69b143c28e..7a139ec030b0 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -2006,6 +2006,44 @@ atomic_long_dec_if_positive(atomic_long_t *v) arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) +#define try_cmpxchg64(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + kcsan_mb(); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) + +#define try_cmpxchg64_acquire(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) + +#define try_cmpxchg64_release(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + kcsan_release(); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) + +#define try_cmpxchg64_relaxed(ptr, oldp, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + typeof(oldp) __ai_oldp = (oldp); \ + instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ + instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ + arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ +}) + #define cmpxchg_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ @@ -2045,4 +2083,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 87c974b93032afd42143613434d1a7788fa598f9 +// 764f741eb77a7ad565dc8d99ce2837d5542e8aee -- cgit v1.2.3 From bec67592521ec816371f5f072b1a340e1c2ad434 Mon Sep 17 00:00:00 2001 From: Min Li Date: Mon, 16 May 2022 10:47:06 -0400 Subject: ptp: ptp_clockmatrix: Add PTP_CLK_REQ_EXTTS support Use TOD_READ_SECONDARY for extts to keep TOD_READ_PRIMARY for gettime and settime exclusively. Before this change, TOD_READ_PRIMARY was used for both extts and gettime/settime, which would result in changing TOD read/write triggers between operations. Using TOD_READ_SECONDARY would make extts independent of gettime/settime operation Signed-off-by: Min Li Acked-by: Richard Cochran Link: https://lore.kernel.org/r/1652712427-14703-1-git-send-email-min.li.xe@renesas.com Signed-off-by: Jakub Kicinski --- include/linux/mfd/idt8a340_reg.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h index a18c1539a152..0c706085c205 100644 --- a/include/linux/mfd/idt8a340_reg.h +++ b/include/linux/mfd/idt8a340_reg.h @@ -407,7 +407,7 @@ #define TOD_READ_PRIMARY_0 0xcc40 #define TOD_READ_PRIMARY_0_V520 0xcc50 /* 8-bit subns, 32-bit ns, 48-bit seconds */ -#define TOD_READ_PRIMARY 0x0000 +#define TOD_READ_PRIMARY_BASE 0x0000 /* Counter increments after TOD write is completed */ #define TOD_READ_PRIMARY_COUNTER 0x000b /* Read trigger configuration */ @@ -424,6 +424,16 @@ #define TOD_READ_SECONDARY_0 0xcc90 #define TOD_READ_SECONDARY_0_V520 0xcca0 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_READ_SECONDARY_BASE 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_READ_SECONDARY_COUNTER 0x000b +/* Read trigger configuration */ +#define TOD_READ_SECONDARY_SEL_CFG_0 0x000c +/* Read trigger selection */ +#define TOD_READ_SECONDARY_CMD 0x000e +#define TOD_READ_SECONDARY_CMD_V520 0x000f + #define TOD_READ_SECONDARY_1 0xcca0 #define TOD_READ_SECONDARY_1_V520 0xccb0 #define TOD_READ_SECONDARY_2 0xccb0 -- cgit v1.2.3 From 1d2c717bc7f7fd3c9cf38d4a0d5d7ede06adf05b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 13 May 2022 06:19:31 +0300 Subject: net/mlx5: Add last command failure syndrome to debugfs Add syndrome of last command failure per command type to debugfs to ease debugging of such failure. last_failed_syndrome - last command failed syndrome returned by FW. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d6bac3976913..74c8cfb771a2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -272,6 +272,8 @@ struct mlx5_cmd_stats { u32 last_failed_errno; /* last bad status returned by FW */ u8 last_failed_mbox_status; + /* last command failed syndrome returned by FW */ + u32 last_failed_syndrome; struct dentry *root; /* protect command average calculations */ spinlock_t lock; -- cgit v1.2.3 From 9b45bde82c229fda94618896ff530dcba9d66fe0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 25 Jan 2022 14:47:36 +0200 Subject: net/mlx5: Inline db alloc API function Take the wrapper version which picks default node into a header file. This reduces the number of exported functions. Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 74c8cfb771a2..b064bc278f52 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1053,9 +1053,14 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node); + +static inline int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + return mlx5_db_alloc_node(dev, db, dev->priv.numa_node); +} + void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); const char *mlx5_command_str(int command); -- cgit v1.2.3 From 94db3317781922ba52722c58061e0e8517d4d80d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 31 Jan 2022 07:49:51 +0200 Subject: net/mlx5: Support multiport eswitch mode Multiport eswitch mode is a LAG mode that allows to add rules that forward traffic to a specific physical port without being affected by LAG affinity configuration. This mode of operation is mutual exclusive with the other LAG modes used by multipath and bonding. To make the transition between the modes, we maintain a counter on the number of rules specifying one of the uplink representors as the target of mirred egress redirect action. An example of such rule would be: $ tc filter add dev enp8s0f0_0 prot all root flower dst_mac \ 00:11:22:33:44:55 action mirred egress redirect dev enp8s0f0 If the reference count just grows to one and LAG is not in use, we create the LAG in multiport eswitch mode. Other mode changes are not allowed while in this mode. When the reference count reaches zero, we destroy the LAG and let other modes be used if needed. logic also changed such that if forwarding to some uplink destination cannot be guaranteed, we fail the operation so the rule will eventually be in software and not in hardware. Signed-off-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7bab3e51c61e..78b3d3465dd7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1359,7 +1359,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 vhca_resource_manager[0x1]; u8 hca_cap_2[0x1]; - u8 reserved_at_21[0x1]; + u8 create_lag_when_not_master_up[0x1]; u8 dtor[0x1]; u8 event_on_vhca_state_teardown_request[0x1]; u8 event_on_vhca_state_in_use[0x1]; @@ -10816,7 +10816,8 @@ struct mlx5_ifc_dcbx_param_bits { enum { MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY = 0, - MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT = 1, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW = 2, }; struct mlx5_ifc_lagc_bits { -- cgit v1.2.3 From 41929c9f628b9990d33a200c54bb0c919e089aa8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 6 Apr 2022 22:55:05 +0200 Subject: clocksource/drivers/ixp4xx: Drop boardfile probe path The boardfiles for IXP4xx have been deleted. Delete all the quirks and code dealing with that boot path and rely solely on device tree boot. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20220406205505.2332821-1-linus.walleij@linaro.org Signed-off-by: Daniel Lezcano --- include/linux/platform_data/timer-ixp4xx.h | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 include/linux/platform_data/timer-ixp4xx.h (limited to 'include/linux') diff --git a/include/linux/platform_data/timer-ixp4xx.h b/include/linux/platform_data/timer-ixp4xx.h deleted file mode 100644 index ee92ae7edaed..000000000000 --- a/include/linux/platform_data/timer-ixp4xx.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __TIMER_IXP4XX_H -#define __TIMER_IXP4XX_H - -#include - -void __init ixp4xx_timer_setup(resource_size_t timerbase, - int timer_irq, - unsigned int timer_freq); - -#endif -- cgit v1.2.3 From 14362a2541797cf9df0e86fb12dcd7950baf566e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 11 May 2022 22:02:12 +0300 Subject: fsnotify: introduce mark type iterator fsnotify_foreach_iter_mark_type() is used to reduce boilerplate code of iterating all marks of a specific group interested in an event by consulting the iterator report_mask. Use an open coded version of that iterator in fsnotify_iter_next() that collects all marks of the current iteration group without consulting the iterator report_mask. At the moment, the two iterator variants are the same, but this decoupling will allow us to exclude some of the group's marks from reporting the event, for example for event on child and inode marks on parent did not request to watch events on children. Fixes: 2f02fd3fa13e ("fanotify: fix ignore mask logic for events on child and on dir") Reported-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220511190213.831646-2-amir73il@gmail.com --- include/linux/fsnotify_backend.h | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9a1a9e78f69f..9560734759fa 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -399,6 +399,7 @@ static inline bool fsnotify_valid_obj_type(unsigned int obj_type) struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; + struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; @@ -415,20 +416,31 @@ static inline void fsnotify_iter_set_report_type( iter_info->report_mask |= (1U << iter_type); } -static inline void fsnotify_iter_set_report_type_mark( - struct fsnotify_iter_info *iter_info, int iter_type, - struct fsnotify_mark *mark) +static inline struct fsnotify_mark *fsnotify_iter_mark( + struct fsnotify_iter_info *iter_info, int iter_type) { - iter_info->marks[iter_type] = mark; - iter_info->report_mask |= (1U << iter_type); + if (fsnotify_iter_should_report_type(iter_info, iter_type)) + return iter_info->marks[iter_type]; + return NULL; +} + +static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, + struct fsnotify_mark **markp) +{ + while (type < FSNOTIFY_ITER_TYPE_COUNT) { + *markp = fsnotify_iter_mark(iter, type); + if (*markp) + break; + type++; + } + return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ - return (iter_info->report_mask & (1U << FSNOTIFY_ITER_TYPE_##NAME)) ? \ - iter_info->marks[FSNOTIFY_ITER_TYPE_##NAME] : NULL; \ + return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) @@ -438,6 +450,11 @@ FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) +#define fsnotify_foreach_iter_mark_type(iter, mark, type) \ + for (type = 0; \ + type = fsnotify_iter_step(iter, type, &mark), \ + type < FSNOTIFY_ITER_TYPE_COUNT; \ + type++) /* * fsnotify_connp_t is what we embed in objects which connector can be attached -- cgit v1.2.3 From e73aaae2fa9024832e1f42e30c787c7baf61d014 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 7 May 2022 14:03:46 +0200 Subject: siphash: use one source of truth for siphash permutations The SipHash family of permutations is currently used in three places: - siphash.c itself, used in the ordinary way it was intended. - random32.c, in a construction from an anonymous contributor. - random.c, as part of its fast_mix function. Each one of these places reinvents the wheel with the same C code, same rotation constants, and same symmetry-breaking constants. This commit tidies things up a bit by placing macros for the permutations and constants into siphash.h, where each of the three .c users can access them. It also leaves a note dissuading more users of them from emerging. Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 23 +++++++---------------- include/linux/siphash.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 056d31317e49..a4aadd2dc153 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,6 +10,7 @@ #include #include +#include u32 prandom_u32(void); void prandom_bytes(void *buf, size_t nbytes); @@ -27,15 +28,10 @@ DECLARE_PER_CPU(unsigned long, net_rand_noise); * The core SipHash round function. Each line can be executed in * parallel given enough CPU resources. */ -#define PRND_SIPROUND(v0, v1, v2, v3) ( \ - v0 += v1, v1 = rol64(v1, 13), v2 += v3, v3 = rol64(v3, 16), \ - v1 ^= v0, v0 = rol64(v0, 32), v3 ^= v2, \ - v0 += v3, v3 = rol64(v3, 21), v2 += v1, v1 = rol64(v1, 17), \ - v3 ^= v0, v1 ^= v2, v2 = rol64(v2, 32) \ -) +#define PRND_SIPROUND(v0, v1, v2, v3) SIPHASH_PERMUTATION(v0, v1, v2, v3) -#define PRND_K0 (0x736f6d6570736575 ^ 0x6c7967656e657261) -#define PRND_K1 (0x646f72616e646f6d ^ 0x7465646279746573) +#define PRND_K0 (SIPHASH_CONST_0 ^ SIPHASH_CONST_2) +#define PRND_K1 (SIPHASH_CONST_1 ^ SIPHASH_CONST_3) #elif BITS_PER_LONG == 32 /* @@ -43,14 +39,9 @@ DECLARE_PER_CPU(unsigned long, net_rand_noise); * This is weaker, but 32-bit machines are not used for high-traffic * applications, so there is less output for an attacker to analyze. */ -#define PRND_SIPROUND(v0, v1, v2, v3) ( \ - v0 += v1, v1 = rol32(v1, 5), v2 += v3, v3 = rol32(v3, 8), \ - v1 ^= v0, v0 = rol32(v0, 16), v3 ^= v2, \ - v0 += v3, v3 = rol32(v3, 7), v2 += v1, v1 = rol32(v1, 13), \ - v3 ^= v0, v1 ^= v2, v2 = rol32(v2, 16) \ -) -#define PRND_K0 0x6c796765 -#define PRND_K1 0x74656462 +#define PRND_SIPROUND(v0, v1, v2, v3) HSIPHASH_PERMUTATION(v0, v1, v2, v3) +#define PRND_K0 (HSIPHASH_CONST_0 ^ HSIPHASH_CONST_2) +#define PRND_K1 (HSIPHASH_CONST_1 ^ HSIPHASH_CONST_3) #else #error Unsupported BITS_PER_LONG diff --git a/include/linux/siphash.h b/include/linux/siphash.h index cce8a9acc76c..3af1428da559 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -138,4 +138,32 @@ static inline u32 hsiphash(const void *data, size_t len, return ___hsiphash_aligned(data, len, key); } +/* + * These macros expose the raw SipHash and HalfSipHash permutations. + * Do not use them directly! If you think you have a use for them, + * be sure to CC the maintainer of this file explaining why. + */ + +#define SIPHASH_PERMUTATION(a, b, c, d) ( \ + (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \ + (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \ + (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \ + (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32)) + +#define SIPHASH_CONST_0 0x736f6d6570736575ULL +#define SIPHASH_CONST_1 0x646f72616e646f6dULL +#define SIPHASH_CONST_2 0x6c7967656e657261ULL +#define SIPHASH_CONST_3 0x7465646279746573ULL + +#define HSIPHASH_PERMUTATION(a, b, c, d) ( \ + (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \ + (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \ + (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \ + (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16)) + +#define HSIPHASH_CONST_0 0U +#define HSIPHASH_CONST_1 0U +#define HSIPHASH_CONST_2 0x6c796765U +#define HSIPHASH_CONST_3 0x74656462U + #endif /* _LINUX_SIPHASH_H */ -- cgit v1.2.3 From d4150779e60fb6c49be25572596b2cdfc5d46a09 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 11 May 2022 16:11:29 +0200 Subject: random32: use real rng for non-deterministic randomness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit random32.c has two random number generators in it: one that is meant to be used deterministically, with some predefined seed, and one that does the same exact thing as random.c, except does it poorly. The first one has some use cases. The second one no longer does and can be replaced with calls to random.c's proper random number generator. The relatively recent siphash-based bad random32.c code was added in response to concerns that the prior random32.c was too deterministic. Out of fears that random.c was (at the time) too slow, this code was anonymously contributed. Then out of that emerged a kind of shadow entropy gathering system, with its own tentacles throughout various net code, added willy nilly. Stop👏making👏bespoke👏random👏number👏generators👏. Fortunately, recent advances in random.c mean that we can stop playing with this sketchiness, and just use get_random_u32(), which is now fast enough. In micro benchmarks using RDPMC, I'm seeing the same median cycle count between the two functions, with the mean being _slightly_ higher due to batches refilling (which we can optimize further need be). However, when doing *real* benchmarks of the net functions that actually use these random numbers, the mean cycles actually *decreased* slightly (with the median still staying the same), likely because the additional prandom code means icache misses and complexity, whereas random.c is generally already being used by something else nearby. The biggest benefit of this is that there are many users of prandom who probably should be using cryptographically secure random numbers. This makes all of those accidental cases become secure by just flipping a switch. Later on, we can do a tree-wide cleanup to remove the static inline wrapper functions that this commit adds. There are also some low-ish hanging fruits for making this even faster in the future: a get_random_u16() function for use in the networking stack will give a 2x performance boost there, using SIMD for ChaCha20 will let us compute 4 or 8 or 16 blocks of output in parallel, instead of just one, giving us large buffers for cheap, and introducing a get_random_*_bh() function that assumes irqs are already disabled will shave off a few cycles for ordinary calls. These are things we can chip away at down the road. Acked-by: Jakub Kicinski Acked-by: Theodore Ts'o Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 52 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index a4aadd2dc153..deace5fb4e62 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,53 +10,16 @@ #include #include -#include +#include -u32 prandom_u32(void); -void prandom_bytes(void *buf, size_t nbytes); -void prandom_seed(u32 seed); -void prandom_reseed_late(void); - -DECLARE_PER_CPU(unsigned long, net_rand_noise); - -#define PRANDOM_ADD_NOISE(a, b, c, d) \ - prandom_u32_add_noise((unsigned long)(a), (unsigned long)(b), \ - (unsigned long)(c), (unsigned long)(d)) - -#if BITS_PER_LONG == 64 -/* - * The core SipHash round function. Each line can be executed in - * parallel given enough CPU resources. - */ -#define PRND_SIPROUND(v0, v1, v2, v3) SIPHASH_PERMUTATION(v0, v1, v2, v3) - -#define PRND_K0 (SIPHASH_CONST_0 ^ SIPHASH_CONST_2) -#define PRND_K1 (SIPHASH_CONST_1 ^ SIPHASH_CONST_3) - -#elif BITS_PER_LONG == 32 -/* - * On 32-bit machines, we use HSipHash, a reduced-width version of SipHash. - * This is weaker, but 32-bit machines are not used for high-traffic - * applications, so there is less output for an attacker to analyze. - */ -#define PRND_SIPROUND(v0, v1, v2, v3) HSIPHASH_PERMUTATION(v0, v1, v2, v3) -#define PRND_K0 (HSIPHASH_CONST_0 ^ HSIPHASH_CONST_2) -#define PRND_K1 (HSIPHASH_CONST_1 ^ HSIPHASH_CONST_3) - -#else -#error Unsupported BITS_PER_LONG -#endif +static inline u32 prandom_u32(void) +{ + return get_random_u32(); +} -static inline void prandom_u32_add_noise(unsigned long a, unsigned long b, - unsigned long c, unsigned long d) +static inline void prandom_bytes(void *buf, size_t nbytes) { - /* - * This is not used cryptographically; it's just - * a convenient 4-word hash function. (3 xor, 2 add, 2 rol) - */ - a ^= raw_cpu_read(net_rand_noise); - PRND_SIPROUND(a, b, c, d); - raw_cpu_write(net_rand_noise, d); + return get_random_bytes(buf, nbytes); } struct rnd_state { @@ -108,7 +71,6 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s2 = __seed(i, 8U); state->s3 = __seed(i, 16U); state->s4 = __seed(i, 128U); - PRANDOM_ADD_NOISE(state, i, 0, 0); } /* Pseudo random number generator from numerical recipes. */ -- cgit v1.2.3 From 2f14062bb14b0fcfcc21e6dc7d5b5c0d25966164 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 5 May 2022 02:20:22 +0200 Subject: random: handle latent entropy and command line from random_init() Currently, start_kernel() adds latent entropy and the command line to the entropy bool *after* the RNG has been initialized, deferring when it's actually used by things like stack canaries until the next time the pool is seeded. This surely is not intended. Rather than splitting up which entropy gets added where and when between start_kernel() and random_init(), just do everything in random_init(), which should eliminate these kinds of bugs in the future. While we're at it, rename the awkwardly titled "rand_initialize()" to the more standard "random_init()" nomenclature. Reviewed-by: Dominik Brodowski Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index f673fbb838b3..b52963955a99 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -14,22 +14,21 @@ struct notifier_block; extern void add_device_randomness(const void *, size_t); extern void add_bootloader_randomness(const void *, size_t); +extern void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value) __latent_entropy; +extern void add_interrupt_randomness(int irq) __latent_entropy; +extern void add_hwgenerator_randomness(const void *buffer, size_t count, + size_t entropy); #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) static inline void add_latent_entropy(void) { - add_device_randomness((const void *)&latent_entropy, - sizeof(latent_entropy)); + add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); } #else static inline void add_latent_entropy(void) {} #endif -extern void add_input_randomness(unsigned int type, unsigned int code, - unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq) __latent_entropy; -extern void add_hwgenerator_randomness(const void *buffer, size_t count, - size_t entropy); #if IS_ENABLED(CONFIG_VMGENID) extern void add_vmfork_randomness(const void *unique_vm_id, size_t size); extern int register_random_vmfork_notifier(struct notifier_block *nb); @@ -41,7 +40,7 @@ static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { extern void get_random_bytes(void *buf, size_t nbytes); extern int wait_for_random_bytes(void); -extern int __init rand_initialize(void); +extern int __init random_init(const char *command_line); extern bool rng_is_initialized(void); extern int register_random_ready_notifier(struct notifier_block *nb); extern int unregister_random_ready_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 354201c53e61e493017b15327294b0c8ab522d69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 May 2022 15:09:21 +0200 Subject: nvme: add support for TP4084 - Time-to-Ready Enhancements Add support for using longer timeouts during controller initialization and letting the controller come up with namespaces that are not ready for I/O yet. We skip these not ready namespaces during scanning and only bring them online once anoter scan is kicked off by the AEN that is set when the NRDY bit gets set in the I/O Command Set Independent Identify Namespace Data Structure. This asynchronous probing avoids blocking the kernel boot when controllers take a very long time to recover after unclean shutdowns (up to minutes). Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke --- include/linux/nvme.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5f6d432fa06a..29ec3e3481ff 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -137,6 +137,7 @@ enum { NVME_REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory * Space Control */ + NVME_REG_CRTO = 0x0068, /* Controller Ready Timeouts */ NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ @@ -161,6 +162,9 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CRTO_CRIMT(crto) ((crto) >> 16) +#define NVME_CRTO_CRWMT(crto) ((crto) & 0xffff) + enum { NVME_CMBSZ_SQS = 1 << 0, NVME_CMBSZ_CQS = 1 << 1, @@ -204,6 +208,7 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CC_CRIME = 1 << 24, }; enum { @@ -227,6 +232,11 @@ enum { NVME_CAP_CSS_CSI = 1 << 6, }; +enum { + NVME_CAP_CRMS_CRIMS = 1ULL << 59, + NVME_CAP_CRMS_CRWMS = 1ULL << 60, +}; + struct nvme_id_power_state { __le16 max_power; /* centiwatts */ __u8 rsvd2; @@ -414,6 +424,21 @@ struct nvme_id_ns { __u8 vs[3712]; }; +/* I/O Command Set Independent Identify Namespace Data Structure */ +struct nvme_id_ns_cs_indep { + __u8 nsfeat; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __le32 anagrpid; + __u8 nsattr; + __u8 rsvd9; + __le16 nvmsetid; + __le16 endgid; + __u8 nstat; + __u8 rsvd15[4081]; +}; + struct nvme_zns_lbafe { __le64 zsze; __u8 zdes; @@ -478,6 +503,7 @@ enum { NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_CS_NS = 0x05, NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_NS_CS_INDEP = 0x08, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -531,6 +557,10 @@ enum { NVME_NS_DPS_PI_TYPE3 = 3, }; +enum { + NVME_NSTAT_NRDY = 1 << 0, +}; + enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, @@ -1592,6 +1622,7 @@ enum { NVME_SC_NS_WRITE_PROTECTED = 0x20, NVME_SC_CMD_INTERRUPTED = 0x21, NVME_SC_TRANSIENT_TR_ERR = 0x22, + NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24, NVME_SC_INVALID_IO_CMD_SET = 0x2C, NVME_SC_LBA_RANGE = 0x80, -- cgit v1.2.3 From 75dbb685f4e8786c33ddef8279bab0eadfb0731f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 14 May 2022 12:16:47 +0200 Subject: libceph: fix potential use-after-free on linger ping and resends request_reinit() is not only ugly as the comment rightfully suggests, but also unsafe. Even though it is called with osdc->lock held for write in all cases, resetting the OSD request refcount can still race with handle_reply() and result in use-after-free. Taking linger ping as an example: handle_timeout thread handle_reply thread down_read(&osdc->lock) req = lookup_request(...) ... finish_request(req) # unregisters up_read(&osdc->lock) __complete_request(req) linger_ping_cb(req) # req->r_kref == 2 because handle_reply still holds its ref down_write(&osdc->lock) send_linger_ping(lreq) req = lreq->ping_req # same req # cancel_linger_request is NOT # called - handle_reply already # unregistered request_reinit(req) WARN_ON(req->r_kref != 1) # fires request_init(req) kref_init(req->r_kref) # req->r_kref == 1 after kref_init ceph_osdc_put_request(req) kref_put(req->r_kref) # req->r_kref == 0 after kref_put, req is freed !!! This happens because send_linger_ping() always (re)uses the same OSD request for watch ping requests, relying on cancel_linger_request() to unregister it from the OSD client and rip its messages out from the messenger. send_linger() does the same for watch/notify registration and watch reconnect requests. Unfortunately cancel_request() doesn't guarantee that after it returns the OSD client would be completely done with the OSD request -- a ref could still be held and the callback (if specified) could still be invoked too. The original motivation for request_reinit() was inability to deal with allocation failures in send_linger() and send_linger_ping(). Switching to using osdc->req_mempool (currently only used by CephFS) respects that and allows us to get rid of request_reinit(). Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Xiubo Li Acked-by: Jeff Layton --- include/linux/ceph/osd_client.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3431011f364d..cba8a6ffc329 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -287,6 +287,9 @@ struct ceph_osd_linger_request { rados_watcherrcb_t errcb; void *data; + struct ceph_pagelist *request_pl; + struct page **notify_id_pages; + struct page ***preply_pages; size_t *preply_len; }; -- cgit v1.2.3 From 3f68e69520d3d52d66a6ad872a75b7d8f2ea7665 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 19 May 2022 10:45:12 +0530 Subject: riscv/efi_stub: Add support for RISCV_EFI_BOOT_PROTOCOL Add support for getting the boot hart ID from the Linux EFI stub using RISCV_EFI_BOOT_PROTOCOL. This method is preferred over the existing DT based approach since it works irrespective of DT or ACPI. The specification of the protocol is hosted at: https://github.com/riscv-non-isa/riscv-uefi Signed-off-by: Sunil V L Acked-by: Palmer Dabbelt Reviewed-by: Heinrich Schuchardt Link: https://lore.kernel.org/r/20220519051512.136724-2-sunilvl@ventanamicro.com [ardb: minor tweaks for coding style and whitespace] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 580ce607d6f5..0412304ce34e 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -410,6 +410,8 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) +#define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) + /* * This GUID may be installed onto the kernel image's handle as a NULL protocol * to signal to the stub that the placement of the image should be respected, -- cgit v1.2.3 From 238e34ad7d5ce36939ac1442c98c7cbb7771f4de Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Sun, 3 Apr 2022 18:47:47 +0530 Subject: iio: adc: qcom-vadc-common: add reverse scaling for PMIC5 Gen2 ADC_TM Add reverse scaling function for PMIC5 Gen2 ADC_TM, to convert temperature to raw ADC code, for setting thresholds for thermistor channels. Signed-off-by: Jishnu Prakash Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/1648991869-20899-3-git-send-email-quic_jprakash@quicinc.com Signed-off-by: Daniel Lezcano --- include/linux/iio/adc/qcom-vadc-common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h index ce78d4804994..aa21b032e861 100644 --- a/include/linux/iio/adc/qcom-vadc-common.h +++ b/include/linux/iio/adc/qcom-vadc-common.h @@ -152,6 +152,8 @@ int qcom_adc5_hw_scale(enum vadc_scale_fn_type scaletype, u16 qcom_adc_tm5_temp_volt_scale(unsigned int prescale_ratio, u32 full_scale_code_volt, int temp); +u16 qcom_adc_tm5_gen2_temp_res_scale(int temp); + int qcom_adc5_prescaling_from_dt(u32 num, u32 den); int qcom_adc5_hw_settle_time_from_dt(u32 value, const unsigned int *hw_settle); -- cgit v1.2.3 From bf70c577516b8d9fbe703371aa98bbea13661cec Mon Sep 17 00:00:00 2001 From: Manaf Meethalavalappu Pallikunhi Date: Wed, 9 Mar 2022 00:56:26 +0530 Subject: thermal/drivers/thermal_of: Add change_mode ops support for thermal_of sensor The sensor driver which register through thermal_of interface doesn't have an option to get thermal zone mode change notification from thermal core. Add support for change_mode ops in thermal_of interface so that sensor driver can use this ops for mode change notification. Signed-off-by: Manaf Meethalavalappu Pallikunhi Link: https://lore.kernel.org/r/1646767586-31908-1-git-send-email-quic_manafm@quicinc.com Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index c314893970b3..365733b428d8 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -299,6 +299,8 @@ struct thermal_zone_params { * temperature. * @set_trip_temp: a pointer to a function that sets the trip temperature on * hardware. + * @change_mode: a pointer to a function that notifies the thermal zone + * mode change. */ struct thermal_zone_of_device_ops { int (*get_temp)(void *, int *); @@ -306,6 +308,7 @@ struct thermal_zone_of_device_ops { int (*set_trips)(void *, int, int); int (*set_emul_temp)(void *, int); int (*set_trip_temp)(void *, int, int); + int (*change_mode) (void *, enum thermal_device_mode); }; /* Function declarations */ -- cgit v1.2.3 From 7782cfeca7d420e8bb707613d4cfb0f7ff29bb3a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 13 May 2022 12:29:38 +0200 Subject: random: remove extern from functions in header Accoriding to the kernel style guide, having `extern` on functions in headers is old school and deprecated, and doesn't add anything. So remove them from random.h, and tidy up the file a little bit too. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 77 ++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index b52963955a99..97f879ea78a5 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -12,13 +12,12 @@ struct notifier_block; -extern void add_device_randomness(const void *, size_t); -extern void add_bootloader_randomness(const void *, size_t); -extern void add_input_randomness(unsigned int type, unsigned int code, - unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq) __latent_entropy; -extern void add_hwgenerator_randomness(const void *buffer, size_t count, - size_t entropy); +void add_device_randomness(const void *, size_t); +void add_bootloader_randomness(const void *, size_t); +void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value) __latent_entropy; +void add_interrupt_randomness(int irq) __latent_entropy; +void add_hwgenerator_randomness(const void *buffer, size_t count, size_t entropy); #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) static inline void add_latent_entropy(void) @@ -26,30 +25,20 @@ static inline void add_latent_entropy(void) add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); } #else -static inline void add_latent_entropy(void) {} +static inline void add_latent_entropy(void) { } #endif #if IS_ENABLED(CONFIG_VMGENID) -extern void add_vmfork_randomness(const void *unique_vm_id, size_t size); -extern int register_random_vmfork_notifier(struct notifier_block *nb); -extern int unregister_random_vmfork_notifier(struct notifier_block *nb); +void add_vmfork_randomness(const void *unique_vm_id, size_t size); +int register_random_vmfork_notifier(struct notifier_block *nb); +int unregister_random_vmfork_notifier(struct notifier_block *nb); #else static inline int register_random_vmfork_notifier(struct notifier_block *nb) { return 0; } static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif -extern void get_random_bytes(void *buf, size_t nbytes); -extern int wait_for_random_bytes(void); -extern int __init random_init(const char *command_line); -extern bool rng_is_initialized(void); -extern int register_random_ready_notifier(struct notifier_block *nb); -extern int unregister_random_ready_notifier(struct notifier_block *nb); -extern size_t __must_check get_random_bytes_arch(void *buf, size_t nbytes); - -#ifndef MODULE -extern const struct file_operations random_fops, urandom_fops; -#endif - +void get_random_bytes(void *buf, size_t nbytes); +size_t __must_check get_random_bytes_arch(void *buf, size_t nbytes); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) @@ -81,11 +70,17 @@ static inline unsigned long get_random_long(void) static inline unsigned long get_random_canary(void) { - unsigned long val = get_random_long(); - - return val & CANARY_MASK; + return get_random_long() & CANARY_MASK; } +unsigned long randomize_page(unsigned long start, unsigned long range); + +int __init random_init(const char *command_line); +bool rng_is_initialized(void); +int wait_for_random_bytes(void); +int register_random_ready_notifier(struct notifier_block *nb); +int unregister_random_ready_notifier(struct notifier_block *nb); + /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ static inline int get_random_bytes_wait(void *buf, size_t nbytes) @@ -109,8 +104,6 @@ declare_get_random_var_wait(int) declare_get_random_var_wait(long) #undef declare_get_random_var -unsigned long randomize_page(unsigned long start, unsigned long range); - /* * This is designed to be standalone for just prandom * users, but for now we include it from @@ -121,22 +114,10 @@ unsigned long randomize_page(unsigned long start, unsigned long range); #ifdef CONFIG_ARCH_RANDOM # include #else -static inline bool __must_check arch_get_random_long(unsigned long *v) -{ - return false; -} -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return false; -} -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return false; -} -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - return false; -} +static inline bool __must_check arch_get_random_long(unsigned long *v) { return false; } +static inline bool __must_check arch_get_random_int(unsigned int *v) { return false; } +static inline bool __must_check arch_get_random_seed_long(unsigned long *v) { return false; } +static inline bool __must_check arch_get_random_seed_int(unsigned int *v) { return false; } #endif /* @@ -160,8 +141,12 @@ static inline bool __init arch_get_random_long_early(unsigned long *v) #endif #ifdef CONFIG_SMP -extern int random_prepare_cpu(unsigned int cpu); -extern int random_online_cpu(unsigned int cpu); +int random_prepare_cpu(unsigned int cpu); +int random_online_cpu(unsigned int cpu); +#endif + +#ifndef MODULE +extern const struct file_operations random_fops, urandom_fops; #endif #endif /* _LINUX_RANDOM_H */ -- cgit v1.2.3 From 7c3a8a1db5e03d02cc0abb3357a84b8b326dfac3 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 13 May 2022 12:32:23 +0200 Subject: random: use proper return types on get_random_{int,long}_wait() Before these were returning signed values, but the API is intended to be used with unsigned values. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 97f879ea78a5..883bf9a23d84 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -90,18 +90,18 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) return ret; } -#define declare_get_random_var_wait(var) \ - static inline int get_random_ ## var ## _wait(var *out) { \ +#define declare_get_random_var_wait(name, ret_type) \ + static inline int get_random_ ## name ## _wait(ret_type *out) { \ int ret = wait_for_random_bytes(); \ if (unlikely(ret)) \ return ret; \ - *out = get_random_ ## var(); \ + *out = get_random_ ## name(); \ return 0; \ } -declare_get_random_var_wait(u32) -declare_get_random_var_wait(u64) -declare_get_random_var_wait(int) -declare_get_random_var_wait(long) +declare_get_random_var_wait(u32, u32) +declare_get_random_var_wait(u64, u32) +declare_get_random_var_wait(int, unsigned int) +declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var /* -- cgit v1.2.3 From a19402634c435a4eae226df53c141cdbb9922e7b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 13 May 2022 13:18:46 +0200 Subject: random: make consistent use of buf and len The current code was a mix of "nbytes", "count", "size", "buffer", "in", and so forth. Instead, let's clean this up by naming input parameters "buf" (or "ubuf") and "len", so that you always understand that you're reading this variety of function argument. Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 883bf9a23d84..fc82f1dc36f1 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -12,12 +12,12 @@ struct notifier_block; -void add_device_randomness(const void *, size_t); -void add_bootloader_randomness(const void *, size_t); +void add_device_randomness(const void *buf, size_t len); +void add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; -void add_hwgenerator_randomness(const void *buffer, size_t count, size_t entropy); +void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy); #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) static inline void add_latent_entropy(void) @@ -29,7 +29,7 @@ static inline void add_latent_entropy(void) { } #endif #if IS_ENABLED(CONFIG_VMGENID) -void add_vmfork_randomness(const void *unique_vm_id, size_t size); +void add_vmfork_randomness(const void *unique_vm_id, size_t len); int register_random_vmfork_notifier(struct notifier_block *nb); int unregister_random_vmfork_notifier(struct notifier_block *nb); #else @@ -37,8 +37,8 @@ static inline int register_random_vmfork_notifier(struct notifier_block *nb) { r static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; } #endif -void get_random_bytes(void *buf, size_t nbytes); -size_t __must_check get_random_bytes_arch(void *buf, size_t nbytes); +void get_random_bytes(void *buf, size_t len); +size_t __must_check get_random_bytes_arch(void *buf, size_t len); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) -- cgit v1.2.3 From 248561ad25a8ba4ecbc7df42f9a5a82fd5fbb4f6 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 14 May 2022 13:09:17 +0200 Subject: random: remove get_random_bytes_arch() and add rng_has_arch_random() The RNG incorporates RDRAND into its state at boot and every time it reseeds, so there's no reason for callers to use it directly. The hashing that the RNG does on it is preferable to using the bytes raw. The only current use case of get_random_bytes_arch() is vsprintf's siphash key for pointer hashing, which uses it to initialize the pointer secret earlier than usual if RDRAND is available. In order to replace this narrow use case, just expose whether RDRAND is mixed into the RNG, with a new function called rng_has_arch_random(). With that taken care of, there are no users of get_random_bytes_arch() left, so it can be removed. Later, if trust_cpu gets turned on by default (as most distros are doing), this one use of rng_has_arch_random() can probably go away as well. Cc: Steven Rostedt Cc: Sergey Senozhatsky Acked-by: Petr Mladek # for vsprintf.c Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index fc82f1dc36f1..6af130c6edb9 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -38,7 +38,6 @@ static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { #endif void get_random_bytes(void *buf, size_t len); -size_t __must_check get_random_bytes_arch(void *buf, size_t len); u32 get_random_u32(void); u64 get_random_u64(void); static inline unsigned int get_random_int(void) @@ -77,6 +76,7 @@ unsigned long randomize_page(unsigned long start, unsigned long range); int __init random_init(const char *command_line); bool rng_is_initialized(void); +bool rng_has_arch_random(void); int wait_for_random_bytes(void); int register_random_ready_notifier(struct notifier_block *nb); int unregister_random_ready_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 6701de6c51c172b5de5633374479503c81fefc0b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 15 May 2022 15:06:18 +0200 Subject: random: remove mostly unused async readiness notifier The register_random_ready_notifier() notifier is somewhat complicated, and was already recently rewritten to use notifier blocks. It is only used now by one consumer in the kernel, vsprintf.c, for which the async mechanism is really overly complex for what it actually needs. This commit removes register_random_ready_notifier() and unregister_random_ ready_notifier(), because it just adds complication with little utility, and changes vsprintf.c to just check on `!rng_is_initialized() && !rng_has_arch_random()`, which will eventually be true. Performance- wise, that code was already using a static branch, so there's basically no overhead at all to this change. Cc: Steven Rostedt Cc: Sergey Senozhatsky Acked-by: Petr Mladek # for vsprintf.c Reviewed-by: Petr Mladek Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 6af130c6edb9..d2360b2825b6 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -78,8 +78,6 @@ int __init random_init(const char *command_line); bool rng_is_initialized(void); bool rng_has_arch_random(void); int wait_for_random_bytes(void); -int register_random_ready_notifier(struct notifier_block *nb); -int unregister_random_ready_notifier(struct notifier_block *nb); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). * Returns the result of the call to wait_for_random_bytes. */ -- cgit v1.2.3 From 5ad7dd882e45d7fe432c32e896e2aaa0b21746ea Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sat, 14 May 2022 13:59:30 +0200 Subject: random: move randomize_page() into mm where it belongs randomize_page is an mm function. It is documented like one. It contains the history of one. It has the naming convention of one. It looks just like another very similar function in mm, randomize_stack_top(). And it has always been maintained and updated by mm people. There is no need for it to be in random.c. In the "which shape does not look like the other ones" test, pointing to randomize_page() is correct. So move randomize_page() into mm/util.c, right next to the similar randomize_stack_top() function. This commit contains no actual code changes. Cc: Andrew Morton Signed-off-by: Jason A. Donenfeld --- include/linux/mm.h | 1 + include/linux/random.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f44254af8ce..b0183450e484 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2677,6 +2677,7 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); +unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/include/linux/random.h b/include/linux/random.h index d2360b2825b6..fae0c84027fd 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -72,8 +72,6 @@ static inline unsigned long get_random_canary(void) return get_random_long() & CANARY_MASK; } -unsigned long randomize_page(unsigned long start, unsigned long range); - int __init random_init(const char *command_line); bool rng_is_initialized(void); bool rng_has_arch_random(void); -- cgit v1.2.3 From 135c579d77d066aece83609329150b87fe81e454 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Mon, 2 May 2022 18:25:05 +0900 Subject: tty: serial: samsung_tty: Fix suspend/resume on S5L We were restoring the IRQ masks then clearing them again, because ucon_mask wasn't set properly. Adding that makes suspend/resume work as intended. Signed-off-by: Hector Martin Link: https://lore.kernel.org/r/20220502092505.30934-1-marcan@marcan.st Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_s3c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index f6c3323fc4c5..dec15f5b3dec 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -256,6 +256,9 @@ #define APPLE_S5L_UCON_DEFAULT (S3C2410_UCON_TXIRQMODE | \ S3C2410_UCON_RXIRQMODE | \ S3C2410_UCON_RXFIFO_TOI) +#define APPLE_S5L_UCON_MASK (APPLE_S5L_UCON_RXTO_ENA_MSK | \ + APPLE_S5L_UCON_RXTHRESH_ENA_MSK | \ + APPLE_S5L_UCON_TXTHRESH_ENA_MSK) #define APPLE_S5L_UTRSTAT_RXTHRESH (1<<4) #define APPLE_S5L_UTRSTAT_TXTHRESH (1<<5) -- cgit v1.2.3 From 0b6c14bdd908879078ec85d65cfb78472a97e4e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 1 Apr 2022 13:10:31 -0400 Subject: SUNRPC: Make cache_req::thread_wait an unsigned long The second parameter of wait_for_completion_interruptible_timeout() is a jiffies value whose type is "unsigned long". Avoid an unnecessary and potentially fraught implicit type conversion at the wait_for_completion_interruptible_timeout() call site in cache_wait_req(). Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index b134b2b3371c..ec5a555df96f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -121,17 +121,17 @@ struct cache_detail { struct net *net; }; - /* this must be embedded in any request structure that * identifies an object that will want a callback on * a cache fill */ struct cache_req { struct cache_deferred_req *(*defer)(struct cache_req *req); - int thread_wait; /* How long (jiffies) we can block the - * current thread to wait for updates. - */ + unsigned long thread_wait; /* How long (jiffies) we can block the + * current thread to wait for updates. + */ }; + /* this must be embedded in a deferred_request that is being * delayed awaiting cache-fill */ -- cgit v1.2.3 From 983084b2672c593959e3148d6a17c8b920797dde Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 6 Apr 2022 14:38:59 -0400 Subject: SUNRPC: Remove svc_rqst::rq_xprt_hlen Clean up: This field is now always set to zero. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 217711fc9cac..b73704fbd09b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -257,7 +257,6 @@ struct svc_rqst { void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ - size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; @@ -397,7 +396,6 @@ struct svc_deferred_req { size_t daddrlen; void *xprt_ctxt; struct cache_deferred_req handle; - size_t xprt_hlen; int argslen; __be32 args[]; }; -- cgit v1.2.3 From 591502c5cb325b1c6ec59ab161927d606b918aa0 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 2 May 2022 14:19:24 -0700 Subject: fs/lock: add helper locks_owner_has_blockers to check for blockers Add helper locks_owner_has_blockers to check if there is any blockers for a given lockowner. Reviewed-by: J. Bruce Fields Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- include/linux/fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..b8ed7f974fb4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1174,6 +1174,8 @@ extern void lease_unregister_notifier(struct notifier_block *); struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); +extern bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1309,6 +1311,11 @@ static inline int lease_modify(struct file_lock *fl, int arg, struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} +static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + return false; +} #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) -- cgit v1.2.3 From 2443da2259e97688f93d64d17ab69b15f466078a Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 2 May 2022 14:19:25 -0700 Subject: fs/lock: add 2 callbacks to lock_manager_operations to resolve conflict Add 2 new callbacks, lm_lock_expirable and lm_expire_lock, to lock_manager_operations to allow the lock manager to take appropriate action to resolve the lock conflict if possible. A new field, lm_mod_owner, is also added to lock_manager_operations. The lm_mod_owner is used by the fs/lock code to make sure the lock manager module such as nfsd, is not freed while lock conflict is being resolved. lm_lock_expirable checks and returns true to indicate that the lock conflict can be resolved else return false. This callback must be called with the flc_lock held so it can not block. lm_expire_lock is called to resolve the lock conflict if the returned value from lm_lock_expirable is true. This callback is called without the flc_lock held since it's allowed to block. Upon returning from this callback, the lock conflict should be resolved and the caller is expected to restart the conflict check from the beginnning of the list. Lock manager, such as NFSv4 courteous server, uses this callback to resolve conflict by destroying lock owner, or the NFSv4 courtesy client (client that has expired but allowed to maintains its states) that owns the lock. Reviewed-by: J. Bruce Fields Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b8ed7f974fb4..aa6c1bbdb8c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1029,6 +1029,7 @@ struct file_lock_operations { }; struct lock_manager_operations { + void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ @@ -1037,6 +1038,8 @@ struct lock_manager_operations { int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *cfl); + void (*lm_expire_lock)(void); }; struct lock_manager { -- cgit v1.2.3 From 2059b698a2efcce3c67b0e61eab7d7680bbe10bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 May 2022 11:45:53 -0400 Subject: SUNRPC: Simplify synopsis of svc_pool_for_cpu() Clean up: There is one caller. The @cpu argument can be made implicit now that a get_cpu/put_cpu pair is no longer needed. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b73704fbd09b..daecb009c05b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -504,7 +504,7 @@ int svc_register(const struct svc_serv *, struct net *, const int, void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); -struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); +struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv); char * svc_print_addr(struct svc_rqst *, char *, size_t); const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, -- cgit v1.2.3 From 53c83d6d8e399fad3d3d25df0ea0d54eb0f94f88 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 18 May 2022 15:23:45 +0200 Subject: siphash: add SPDX tags as sole licensing authority The text "dual BSD/GPLv2 license" is somewhat ambiguous, and moving this over to SPDX is overdue. This commit adds SPDX tags to the relevant files and clarifies that it's GPLv2 only and 3-clause BSD. It also removes the old text, so that the SPDX tags are the only source of the information. Suggested-by: Thomas Gleixner Signed-off-by: Jason A. Donenfeld Signed-off-by: Greg Kroah-Hartman --- include/linux/siphash.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index cce8a9acc76c..9e822311b29e 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -1,6 +1,5 @@ -/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. - * - * This file is provided under a dual BSD/GPLv2 license. +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ +/* Copyright (C) 2016-2022 Jason A. Donenfeld . All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ -- cgit v1.2.3 From e6d3c99adf54fc1dcd07729990dc32acd3be874b Mon Sep 17 00:00:00 2001 From: Abhyuday Godhasara Date: Wed, 27 Apr 2022 00:48:03 -0700 Subject: driver: soc: xilinx: Update function prototype for xlnx_unregister_event As per the current implementation only single callback data gets saved per event, driver is throwing an error if try to register multiple callback for same event. So at time of unregistration of any event required things are event details and callback handler as parameter of xlnx_unregister_event(). As part of adding support of multiple callbacks for same event also require change in prototype of xlnx_unregister_event(). During unregistration of any events, now required things are event details, callback handler and agent's private data as parameter of xlnx_unregister_event(). Also modify the usage of xlnx_unregister_event() in xilinx/zynqmp_power.c driver as per new implementation. Signed-off-by: Abhyuday Godhasara Link: https://lore.kernel.org/r/20220427074803.19009-3-abhyuday.godhasara@xilinx.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-event-manager.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-event-manager.h b/include/linux/firmware/xlnx-event-manager.h index 3f87c4929d21..82e8254b0f80 100644 --- a/include/linux/firmware/xlnx-event-manager.h +++ b/include/linux/firmware/xlnx-event-manager.h @@ -17,7 +17,7 @@ int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id, event_cb_func_t cb_fun, void *data); int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id, - const u32 event, event_cb_func_t cb_fun); + const u32 event, event_cb_func_t cb_fun, void *data); #else static inline int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 node_id, const u32 event, const bool wake, @@ -27,7 +27,7 @@ static inline int xlnx_register_event(const enum pm_api_cb_id cb_type, const u32 } static inline int xlnx_unregister_event(const enum pm_api_cb_id cb_type, const u32 node_id, - const u32 event, event_cb_func_t cb_fun) + const u32 event, event_cb_func_t cb_fun, void *data) { return -ENODEV; } -- cgit v1.2.3 From 885525c1e7e27ea6207d648a8db20dfbbd9e4238 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Apr 2022 11:56:48 +0200 Subject: clk: renesas: r9a06g032: Export function to set dmamux The dmamux register is located within the system controller. Without syscon, we need an extra helper in order to give write access to this register to a dmamux driver. Signed-off-by: Miquel Raynal Acked-by: Stephen Boyd Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220427095653.91804-5-miquel.raynal@bootlin.com Signed-off-by: Vinod Koul --- include/linux/soc/renesas/r9a06g032-sysctrl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/soc/renesas/r9a06g032-sysctrl.h (limited to 'include/linux') diff --git a/include/linux/soc/renesas/r9a06g032-sysctrl.h b/include/linux/soc/renesas/r9a06g032-sysctrl.h new file mode 100644 index 000000000000..066dfb15cbdd --- /dev/null +++ b/include/linux/soc/renesas/r9a06g032-sysctrl.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_SOC_RENESAS_R9A06G032_SYSCTRL_H__ +#define __LINUX_SOC_RENESAS_R9A06G032_SYSCTRL_H__ + +#ifdef CONFIG_CLK_R9A06G032 +int r9a06g032_sysctrl_set_dmamux(u32 mask, u32 val); +#else +static inline int r9a06g032_sysctrl_set_dmamux(u32 mask, u32 val) { return -ENODEV; } +#endif + +#endif /* __LINUX_SOC_RENESAS_R9A06G032_SYSCTRL_H__ */ -- cgit v1.2.3 From 13dfd97a341a5cf9d15f415dd469f45e971ef12a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 19 May 2022 14:02:32 +0300 Subject: notifier: Add atomic_notifier_call_chain_is_empty() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add atomic_notifier_call_chain_is_empty() that returns true if given atomic call chain is empty. The first user of this new notifier API function will be the kernel power-off core code that will support power-off call chains. The core code will need to check whether there is a power-off handler registered at all in order to decide whether to halt machine or power it off. Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 87069b8459af..95e2440037de 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -173,6 +173,8 @@ extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); +extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh); + #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ -- cgit v1.2.3 From c82f898d873ce10d88f8d60b38c8dd19f1ed7c66 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:10 +0300 Subject: notifier: Add blocking/atomic_notifier_chain_register_unique_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add variant of blocking/atomic_notifier_chain_register() functions that allow registration of a notifier only if it has unique priority, otherwise -EBUSY error code is returned by the new functions. Reviewed-by: Michał Mirosław Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/notifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 95e2440037de..aef88c2d1173 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -150,6 +150,11 @@ extern int raw_notifier_chain_register(struct raw_notifier_head *nh, extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); +extern int atomic_notifier_chain_register_unique_prio( + struct atomic_notifier_head *nh, struct notifier_block *nb); +extern int blocking_notifier_chain_register_unique_prio( + struct blocking_notifier_head *nh, struct notifier_block *nb); + extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, -- cgit v1.2.3 From 232edc2f72f5beda3586360de6254d443820050a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:11 +0300 Subject: kernel/reboot: Introduce sys-off handler API In order to support power-off chaining we need to get rid of the global pm_* variables, replacing them with the new kernel API functions that support chaining. Introduce new generic sys-off handler API that brings the following features: 1. Power-off and restart handlers are registered using same API function that supports chaining, hence all power-off and restart modes will support chaining using this unified function. 2. Prevents notifier priority collisions by disallowing registration of multiple handlers at the non-default priority level. 3. Supports passing opaque user argument to callback, which allows us to remove global variables from drivers. This patch adds support of the following sys-off modes: - SYS_OFF_MODE_POWER_OFF_PREPARE that replaces global pm_power_off_prepare variable and provides chaining support for power-off-prepare handlers. - SYS_OFF_MODE_POWER_OFF that replaces global pm_power_off variable and provides chaining support for power-off handlers. - SYS_OFF_MODE_RESTART that provides a better restart API, removing a need from drivers to have a global scratch variable by utilizing the opaque callback argument. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index af907a3d68d1..b79f8942ea5f 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -7,6 +7,7 @@ #include struct device; +struct sys_off_handler; #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN @@ -62,6 +63,82 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); +/* + * sys-off handler API. + */ + +/* + * Standard sys-off priority levels. Users are expected to set priorities + * relative to the standard levels. + * + * SYS_OFF_PRIO_PLATFORM: Use this for platform-level handlers. + * + * SYS_OFF_PRIO_LOW: Use this for handler of last resort. + * + * SYS_OFF_PRIO_DEFAULT: Use this for normal handlers. + * + * SYS_OFF_PRIO_HIGH: Use this for higher priority handlers. + * + * SYS_OFF_PRIO_FIRMWARE: Use this if handler uses firmware call. + */ +#define SYS_OFF_PRIO_PLATFORM -256 +#define SYS_OFF_PRIO_LOW -128 +#define SYS_OFF_PRIO_DEFAULT 0 +#define SYS_OFF_PRIO_HIGH 192 +#define SYS_OFF_PRIO_FIRMWARE 224 + +enum sys_off_mode { + /** + * @SYS_OFF_MODE_POWER_OFF_PREPARE: + * + * Handlers prepare system to be powered off. Handlers are + * allowed to sleep. + */ + SYS_OFF_MODE_POWER_OFF_PREPARE, + + /** + * @SYS_OFF_MODE_POWER_OFF: + * + * Handlers power-off system. Handlers are disallowed to sleep. + */ + SYS_OFF_MODE_POWER_OFF, + + /** + * @SYS_OFF_MODE_RESTART: + * + * Handlers restart system. Handlers are disallowed to sleep. + */ + SYS_OFF_MODE_RESTART, +}; + +/** + * struct sys_off_data - sys-off callback argument + * + * @mode: Mode ID. Currently used only by the sys-off restart mode, + * see enum reboot_mode for the available modes. + * @cb_data: User's callback data. + * @cmd: Command string. Currently used only by the sys-off restart mode, + * NULL otherwise. + */ +struct sys_off_data { + int mode; + void *cb_data; + const char *cmd; +}; + +struct sys_off_handler * +register_sys_off_handler(enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data); +void unregister_sys_off_handler(struct sys_off_handler *handler); + +int devm_register_sys_off_handler(struct device *dev, + enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data); + /* * Architecture independent implemenations of sys_reboot commands. */ -- cgit v1.2.3 From 2b6aa7332f8020dfdaffe340ff038aac4df35238 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:13 +0300 Subject: kernel/reboot: Add do_kernel_power_off() Add do_kernel_power_off() helper that will remove open-coded pm_power_off invocations from the architecture code. This is the first step on the way to remove the global pm_power_off variable, which will allow us to implement consistent power-off chaining support. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index b79f8942ea5f..99a9753e77bc 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -63,6 +63,8 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); +void do_kernel_power_off(void); + /* * sys-off handler API. */ -- cgit v1.2.3 From 0e2110d2e910e44cc7cf23fce14613e232602602 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:15 +0300 Subject: kernel/reboot: Add kernel_can_power_off() Add kernel_can_power_off() helper that replaces open-coded checks of the global pm_power_off variable. This is a necessary step towards supporting chained power-off handlers. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 99a9753e77bc..5837c6c2fb40 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -149,6 +149,7 @@ extern void kernel_restart_prepare(char *cmd); extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); +extern bool kernel_can_power_off(void); extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); -- cgit v1.2.3 From fb61375ecfba49e153af561402f49f6fe3bebd39 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:16 +0300 Subject: kernel/reboot: Add register_platform_power_off() Add platform-level registration helpers that will ease transition of the arch/platform power-off callbacks to the new sys-off based API, allowing us to remove the global pm_power_off variable in the future. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 5837c6c2fb40..a8fee1d09c65 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -141,6 +141,9 @@ int devm_register_sys_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int register_platform_power_off(void (*power_off)(void)); +void unregister_platform_power_off(void (*power_off)(void)); + /* * Architecture independent implemenations of sys_reboot commands. */ -- cgit v1.2.3 From 5b71808eb7c97815fc3b9c386ca0ef6daf2dc053 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:32 +0300 Subject: reboot: Remove pm_power_off_prepare() All pm_power_off_prepare() users were converted to sys-off handler API. Remove the obsolete global callback variable. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..3786d306dad2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -21,7 +21,6 @@ * Callbacks for platform drivers to implement. */ extern void (*pm_power_off)(void); -extern void (*pm_power_off_prepare)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -- cgit v1.2.3 From d2c5415327171e6c1bca2dc4d8a77f450ecf7150 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:34 +0300 Subject: kernel/reboot: Add devm_register_power_off_handler() Add devm_register_power_off_handler() helper that registers sys-off handler using power-off mode and with a default priority. Most drivers will want to register power-off handler with a default priority, so this helper will reduce the boilerplate code and make code easier to read and follow. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index a8fee1d09c65..3b063c5b6118 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -141,6 +141,10 @@ int devm_register_sys_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int devm_register_power_off_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data); + int register_platform_power_off(void (*power_off)(void)); void unregister_platform_power_off(void (*power_off)(void)); -- cgit v1.2.3 From 6779db970bd287bb35b28bd5dc256fd7aef19d1c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Tue, 10 May 2022 02:32:35 +0300 Subject: kernel/reboot: Add devm_register_restart_handler() Add devm_register_restart_handler() helper that registers sys-off handler using restart mode and with a default priority. Most drivers will want to register restart handler with a default priority, so this helper will reduce the boilerplate code and make code easier to read and follow. Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- include/linux/reboot.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 3b063c5b6118..d29d37a2abb6 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -145,6 +145,10 @@ int devm_register_power_off_handler(struct device *dev, int (*callback)(struct sys_off_data *data), void *cb_data); +int devm_register_restart_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data); + int register_platform_power_off(void (*power_off)(void)); void unregister_platform_power_off(void (*power_off)(void)); -- cgit v1.2.3 From 15f214f9bdb7c1f560b4bf863c5a72ff53b442a4 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 May 2022 11:34:33 +0200 Subject: topology: Remove unused cpu_cluster_mask() default_topology[] uses cpu_clustergroup_mask() for the CLS level (guarded by CONFIG_SCHED_CLUSTER) which is currently provided by x86 (arch/x86/kernel/smpboot.c) and arm64 (drivers/base/arch_topology.c). Fixes: 778c558f49a2 ("sched: Add cluster scheduler level in core and related Kconfig for ARM64") Acked-by: Barry Song Signed-off-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20220513093433.425163-1-dietmar.eggemann@arm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/topology.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index f19bc3626297..4564faafd0e1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,13 +240,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif -#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) -static inline const struct cpumask *cpu_cluster_mask(int cpu) -{ - return topology_cluster_cpumask(cpu); -} -#endif - static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); -- cgit v1.2.3 From 0651ab90e4ade17f1d4f4367b70f6120480410f3 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Wed, 18 May 2022 11:08:57 +0200 Subject: ACPI: CPPC: Check _OSC for flexible address space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': Starting with ACPI Specification 6.2, all _CPC registers can be in PCC, System Memory, System IO, or Functional Fixed Hardware address spaces. OSPM support for this more flexible register space scheme is indicated by the “Flexible Address Space for CPPC Registers” _OSC bit Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: - PCC or Functional Fixed Hardware address space if defined - SystemMemory address space (NULL register) if not defined Add the corresponding _OSC bit and check it when parsing _CPC objects. Signed-off-by: Pierre Gondois Reviewed-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d7136d13aa44..03465db16b68 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -574,6 +574,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000 +#define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_PRM_SUPPORT 0x00200000 @@ -581,6 +582,7 @@ extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc_not_supported; +extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 0x00000001 -- cgit v1.2.3 From 66d29d802ef3bf55a49b07568b0048823d4a72a6 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 11 May 2022 16:56:56 +0200 Subject: PM: domains: Allocate gpd_timing_data dynamically based on governor If a genpd doesn't have an associated governor assigned, there's really no point to allocate the per device gpd_timing_data, as the data isn't being used by a governor anyway. To avoid wasting memory, let's therefore convert the corresponding td variable in the struct generic_pm_domain_data into a pointer and manage the allocation of its data dynamically. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 043d48e4420a..126a4b9ab215 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -193,7 +193,7 @@ struct pm_domain_data { struct generic_pm_domain_data { struct pm_domain_data base; - struct gpd_timing_data td; + struct gpd_timing_data *td; struct notifier_block nb; struct notifier_block *power_nb; int cpu; -- cgit v1.2.3 From 9c74f2ac4801473d03b0e29fab74eb94a9944521 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 11 May 2022 16:56:57 +0200 Subject: PM: domains: Move the next_wakeup variable into the struct gpd_timing_data If the corresponding genpd for the device doesn't use a governor, the variable next_wakeup within the struct generic_pm_domain_data becomes superfluous. To avoid wasting memory, let's move it into the struct gpd_timing_data, which is already being allocated based upon if there is governor assigned. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 126a4b9ab215..1f370f074f30 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -182,6 +182,7 @@ struct gpd_timing_data { s64 suspend_latency_ns; s64 resume_latency_ns; s64 effective_constraint_ns; + ktime_t next_wakeup; bool constraint_changed; bool cached_suspend_ok; }; @@ -200,7 +201,6 @@ struct generic_pm_domain_data { unsigned int performance_state; unsigned int default_pstate; unsigned int rpm_pstate; - ktime_t next_wakeup; void *data; }; -- cgit v1.2.3 From f38d1a6d002526a4e8840e9bb19733e9d4ce1a67 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 11 May 2022 16:57:02 +0200 Subject: PM: domains: Allocate governor data dynamically based on a genpd governor If a genpd doesn't have an associated governor assigned, several variables in the struct generic_pm_domain becomes superfluous. Rather than wasting memory in allocated genpds, let's move the variables from the struct generic_pm_domain into a new separate struct. In this way, we can instead dynamically decide when we need to allocate the corresponding data for it. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1f370f074f30..ebc351698090 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -91,6 +91,14 @@ struct gpd_dev_ops { int (*stop)(struct device *dev); }; +struct genpd_governor_data { + s64 max_off_time_ns; + bool max_off_time_changed; + ktime_t next_wakeup; + bool cached_power_down_ok; + bool cached_power_down_state_idx; +}; + struct genpd_power_state { s64 power_off_latency_ns; s64 power_on_latency_ns; @@ -114,6 +122,7 @@ struct generic_pm_domain { struct list_head child_links; /* Links with PM domain as a child */ struct list_head dev_list; /* List of devices */ struct dev_power_governor *gov; + struct genpd_governor_data *gd; /* Data used by a genpd governor. */ struct work_struct power_off_work; struct fwnode_handle *provider; /* Identity of the domain provider */ bool has_provider; @@ -134,11 +143,6 @@ struct generic_pm_domain { int (*set_performance_state)(struct generic_pm_domain *genpd, unsigned int state); struct gpd_dev_ops dev_ops; - s64 max_off_time_ns; /* Maximum allowed "suspended" time. */ - ktime_t next_wakeup; /* Maintained by the domain governor */ - bool max_off_time_changed; - bool cached_power_down_ok; - bool cached_power_down_state_idx; int (*attach_dev)(struct generic_pm_domain *domain, struct device *dev); void (*detach_dev)(struct generic_pm_domain *domain, -- cgit v1.2.3 From 4853f68d158ac59b05985a6af5b7da7ccdbc14c8 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Fri, 8 Apr 2022 18:09:09 +0800 Subject: kexec_file: Fix kexec_file.c build error for riscv platform When CONFIG_KEXEC_FILE is set for riscv platform, the compilation of kernel/kexec_file.c generate build error: kernel/kexec_file.c: In function 'crash_prepare_elf64_headers': ./arch/riscv/include/asm/page.h:110:71: error: request for member 'virt_addr' in something not a structure or union 110 | ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr)) | ^ ./arch/riscv/include/asm/page.h:131:2: note: in expansion of macro 'is_linear_mapping' 131 | is_linear_mapping(_x) ? \ | ^~~~~~~~~~~~~~~~~ ./arch/riscv/include/asm/page.h:140:31: note: in expansion of macro '__va_to_pa_nodebug' 140 | #define __phys_addr_symbol(x) __va_to_pa_nodebug(x) | ^~~~~~~~~~~~~~~~~~ ./arch/riscv/include/asm/page.h:143:24: note: in expansion of macro '__phys_addr_symbol' 143 | #define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0)) | ^~~~~~~~~~~~~~~~~~ kernel/kexec_file.c:1327:36: note: in expansion of macro '__pa_symbol' 1327 | phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); This occurs is because the "kernel_map" referenced in macro is_linear_mapping() is suppose to be the one of struct kernel_mapping defined in arch/riscv/mm/init.c, but the 2nd argument of crash_prepare_elf64_header() has same symbol name, in expansion of macro is_linear_mapping in function crash_prepare_elf64_header(), "kernel_map" actually is the local variable. Signed-off-by: Liao Chang Link: https://lore.kernel.org/r/20220408100914.150110-2-lizhengyu3@huawei.com Signed-off-by: Palmer Dabbelt --- include/linux/kexec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 58d1b58a971e..ebb1bffbf068 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -227,7 +227,7 @@ struct crash_mem { extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend); -extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, +extern int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz); #endif /* CONFIG_KEXEC_FILE */ -- cgit v1.2.3 From 6c1e423a3c84953edcf91ff03ab97829b287184a Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 18 May 2022 17:45:27 +0200 Subject: can: can-dev: remove obsolete CAN LED support Since commit 30f3b42147ba6f ("can: mark led trigger as broken") the CAN specific LED support was disabled and marked as BROKEN. As the common LED support with CONFIG_LEDS_TRIGGER_NETDEV should do this work now the code can be removed as preparation for a CAN netdevice Kconfig rework. Link: https://lore.kernel.org/all/20220518154527.29046-1-socketcan@hartkopp.net Suggested-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp [mkl: remove led.h from MAINTAINERS] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 10 ---------- include/linux/can/led.h | 51 ------------------------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 include/linux/can/led.h (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c2ea47f30046..e22dc03c850e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -85,15 +84,6 @@ struct can_priv { int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); - -#ifdef CONFIG_CAN_LEDS - struct led_trigger *tx_led_trig; - char tx_led_trig_name[CAN_LED_NAME_SZ]; - struct led_trigger *rx_led_trig; - char rx_led_trig_name[CAN_LED_NAME_SZ]; - struct led_trigger *rxtx_led_trig; - char rxtx_led_trig_name[CAN_LED_NAME_SZ]; -#endif }; static inline bool can_tdc_is_enabled(const struct can_priv *priv) diff --git a/include/linux/can/led.h b/include/linux/can/led.h deleted file mode 100644 index 7c3cfd798c56..000000000000 --- a/include/linux/can/led.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2012, Fabio Baltieri - */ - -#ifndef _CAN_LED_H -#define _CAN_LED_H - -#include -#include -#include - -enum can_led_event { - CAN_LED_EVENT_OPEN, - CAN_LED_EVENT_STOP, - CAN_LED_EVENT_TX, - CAN_LED_EVENT_RX, -}; - -#ifdef CONFIG_CAN_LEDS - -/* keep space for interface name + "-tx"/"-rx"/"-rxtx" - * suffix and null terminator - */ -#define CAN_LED_NAME_SZ (IFNAMSIZ + 6) - -void can_led_event(struct net_device *netdev, enum can_led_event event); -void devm_can_led_init(struct net_device *netdev); -int __init can_led_notifier_init(void); -void __exit can_led_notifier_exit(void); - -#else - -static inline void can_led_event(struct net_device *netdev, - enum can_led_event event) -{ -} -static inline void devm_can_led_init(struct net_device *netdev) -{ -} -static inline int can_led_notifier_init(void) -{ - return 0; -} -static inline void can_led_notifier_exit(void) -{ -} - -#endif - -#endif /* !_CAN_LED_H */ -- cgit v1.2.3 From b265cdebdfefbee4ef1ae2972af36f7a7cac5148 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:48 -0700 Subject: sched: coredump.h: clarify the use of MMF_VM_HUGEPAGE Patch series "Make khugepaged collapse readonly FS THP more consistent", v4. The readonly FS THP relies on khugepaged to collapse THP for suitable vmas. But the behavior is inconsistent for "always" mode (https://lore.kernel.org/linux-mm/00f195d4-d039-3cf2-d3a1-a2c88de397a0@suse.cz/). The "always" mode means THP allocation should be tried all the time and khugepaged should try to collapse THP all the time. Of course the allocation and collapse may fail due to other factors and conditions. Currently file THP may not be collapsed by khugepaged even though all the conditions are met. That does break the semantics of "always" mode. So make sure readonly FS vmas are registered to khugepaged to fix the break. Register suitable vmas in common mmap path, that could cover both readonly FS vmas and shmem vmas, so remove the khugepaged calls in shmem.c. The patch 1-7 are minor bug fixes, clean up and preparation patches. Patch 8 is the real meat. Tested with khugepaged test in selftests and the testcase provided by Vlastimil Babka in https://lore.kernel.org/lkml/df3b5d1c-a36b-2c73-3e27-99e74983de3a@suse.cz/ by commenting out MADV_HUGEPAGE call. This patch (of 8): MMF_VM_HUGEPAGE is set as long as the mm is available for khugepaged by khugepaged_enter(), not only when VM_HUGEPAGE is set on vma. Correct the comment to avoid confusion. Link: https://lkml.kernel.org/r/20220510203222.24246-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20220510203222.24246-2-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Miaohe Lin Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d9e3a656875..4d0a5be28b70 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -57,7 +57,8 @@ static inline int get_dumpable(struct mm_struct *mm) #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_VM_HUGEPAGE 17 /* set when mm is available for + khugepaged */ /* * This one-shot flag is dropped due to necessity of changing exe once again * on NFS restore -- cgit v1.2.3 From 78d12c19e02d8151b1c82228ae6bb10f174a68e2 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: thp: only regular file could be THP eligible Since commit a4aeaa06d45e ("mm: khugepaged: skip huge page collapse for special files"), khugepaged just collapses THP for regular file which is the intended usecase for readonly fs THP. Only show regular file as THP eligible accordingly. And make file_thp_enabled() available for khugepaged too in order to remove duplicate code. Link: https://lkml.kernel.org/r/20220510203222.24246-5-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fbf36bb1be22..de29821231c9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -173,6 +173,20 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!vma->vm_file) + return false; + + inode = vma->vm_file->f_inode; + + return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && + (vma->vm_flags & VM_EXEC) && + !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); +} + bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ -- cgit v1.2.3 From d2081b2bf8195b8239c67fdd61518e077da7cbec Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: make khugepaged_enter() void function The most callers of khugepaged_enter() don't care about the return value. Only dup_mmap(), anonymous THP page fault and MADV_HUGEPAGE handle the error by returning -ENOMEM. Actually it is not harmful for them to ignore the error case either. It also sounds overkilling to fail fork() and page fault early due to khugepaged_enter() error, and MADV_HUGEPAGE does set VM_HUGEPAGE flag regardless of the error. Link: https://lkml.kernel.org/r/20220510203222.24246-6-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Song Liu Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 2fcc01891b47..0423d3619f26 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -12,10 +12,10 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); @@ -40,11 +40,10 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm, (transparent_hugepage_flags & \ (1<flags)) - return __khugepaged_enter(mm); - return 0; + __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) @@ -53,7 +52,7 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma, +static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) @@ -62,27 +61,22 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - if (__khugepaged_enter(vma->vm_mm)) - return -ENOMEM; - return 0; + __khugepaged_enter(vma->vm_mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { - return 0; } static inline void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) -- cgit v1.2.3 From 2647d11b9e71d495b2d4985ebaf7a734373a4b80 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:49 -0700 Subject: mm: khugepaged: make hugepage_vma_check() non-static The hugepage_vma_check() could be reused by khugepaged_enter() and khugepaged_enter_vma_merge(), but it is static in khugepaged.c. Make it non-static and declare it in khugepaged.h. Link: https://lkml.kernel.org/r/20220510203222.24246-7-shy828301@gmail.com Signed-off-by: Yang Shi Suggested-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 0423d3619f26..c340f6bb39d6 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -3,8 +3,6 @@ #define _LINUX_KHUGEPAGED_H #include /* MMF_VM_HUGEPAGE */ -#include - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -12,6 +10,8 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); +extern bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, @@ -55,13 +55,11 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if ((khugepaged_always() || - (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) || - (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE) && - !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + khugepaged_enabled()) { + if (hugepage_vma_check(vma, vm_flags)) __khugepaged_enter(vma->vm_mm); + } } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) -- cgit v1.2.3 From c791576c60288c89b351ea2d2098f6a872d78fa7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm: khugepaged: introduce khugepaged_enter_vma() helper The khugepaged_enter_vma_merge() actually does as the same thing as the khugepaged_enter() section called by shmem_mmap(), so consolidate them into one helper and rename it to khugepaged_enter_vma(). Link: https://lkml.kernel.org/r/20220510203222.24246-8-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastmil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Rik van Riel Cc: Song Liu Cc: Song Liu Cc: Theodore Ts'o Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index c340f6bb39d6..392d34c3c59a 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -14,8 +14,8 @@ extern bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern void khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); @@ -72,8 +72,8 @@ static inline void khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) { } static inline void collapse_pte_mapped_thp(struct mm_struct *mm, -- cgit v1.2.3 From bc4a68adb151f08b50822158c7767f5726346370 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:50 -0700 Subject: mm/swap: remove unneeded return value of free_swap_slot The return value of free_swap_slot is always 0 and also ignored now. Remove it to clean up the code. Link: https://lkml.kernel.org/r/20220509131416.17553-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 347f1a304190..15adfb8c813a 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -24,7 +24,7 @@ struct swap_slots_cache { void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -int free_swap_slot(swp_entry_t entry); +void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; -- cgit v1.2.3 From 3db3264d8a5f4816c022eb9052694ce51e657bfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:51 -0700 Subject: mm/swap: make page_swapcount and __lru_add_drain_all static Make page_swapcount and __lru_add_drain_all static. They are only used within the file now. Link: https://lkml.kernel.org/r/20220509131416.17553-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6bf1506e5080..0ff887cad383 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -485,7 +485,6 @@ int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); -extern int page_swapcount(struct page *); extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); @@ -557,12 +556,6 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp) { } - -static inline int page_swapcount(struct page *page) -{ - return 0; -} - static inline int __swap_count(swp_entry_t entry) { return 0; -- cgit v1.2.3 From a930c210c42dec32b031ad7645bd6904701b5297 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: fix the obsolete comment for SWP_TYPE_SHIFT Since commit 3159f943aafd ("xarray: Replace exceptional entries"), there is only one bit of 'type' can be shifted up. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220509131416.17553-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swapops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 09945342bf76..fe220df499f1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -13,10 +13,10 @@ * get good packing density in that tree, so the index should be dense in * the low-order bits. * - * We arrange the `type' and `offset' fields so that `type' is at the seven + * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for - * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry(). + * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -- cgit v1.2.3 From ff351f4bb9606a6c9a1f3fcdd918635b51514779 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 14:08:52 -0700 Subject: mm/swap: fix comment about swap extent Since commit 4efaceb1c5f8 ("mm, swap: use rbtree for swap_extent"), rbtree is used for swap extent. Also curr_swap_extent is removed at that time. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220509131416.17553-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ff887cad383..04c0713c1d6e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -168,8 +168,8 @@ struct zone; /* * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of - * disk blocks. A list of swap extents maps the entire swapfile. (Where the - * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart + * disk blocks. A rbtree of swap extents maps the entire swapfile (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart * from setup, they're handled identically. * * We always assume that blocks are of size PAGE_SIZE. -- cgit v1.2.3 From f6498b776d280b30a4614d8261840961e993c2c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: mm: zswap: add basic meminfo and vmstat coverage Currently it requires poking at debugfs to figure out the size and population of the zswap cache on a host. There are no counters for reads and writes against the cache. As a result, it's difficult to understand zswap behavior on production systems. Print zswap memory consumption and how many pages are zswapped out in /proc/meminfo. Count zswapouts and zswapins in /proc/vmstat. Link: https://lkml.kernel.org/r/20220510152847.230957-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: David Hildenbrand Cc: Dan Streetman Cc: Michal Hocko Cc: Minchan Kim Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ include/linux/vm_event_item.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 04c0713c1d6e..f3ae17b43f20 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -620,6 +620,11 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif +#ifdef CONFIG_ZSWAP +extern u64 zswap_pool_total_size; +extern atomic_t zswap_stored_pages; +#endif + #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e83967e4c20e..404024486fa5 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -136,6 +136,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_KSM COW_KSM, #endif +#ifdef CONFIG_ZSWAP + ZSWPIN, + ZSWPOUT, +#endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, -- cgit v1.2.3 From f4840ccfca25db225b3371a8f7b5770febee87c5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 May 2022 14:08:53 -0700 Subject: zswap: memcg accounting Applications can currently escape their cgroup memory containment when zswap is enabled. This patch adds per-cgroup tracking and limiting of zswap backend memory to rectify this. The existing cgroup2 memory.stat file is extended to show zswap statistics analogous to what's in meminfo and vmstat. Furthermore, two new control files, memory.zswap.current and memory.zswap.max, are added to allow tuning zswap usage on a per-workload basis. This is important since not all workloads benefit from zswap equally; some even suffer compared to disk swap when memory contents don't compress well. The optimal size of the zswap pool, and the threshold for writeback, also depends on the size of the workload's warm set. The implementation doesn't use a traditional page_counter transaction. zswap is unconventional as a memory consumer in that we only know the amount of memory to charge once expensive compression has occurred. If zwap is disabled or the limit is already exceeded we obviously don't want to compress page upon page only to reject them all. Instead, the limit is checked against current usage, then we compress and charge. This allows some limit overrun, but not enough to matter in practice. [hannes@cmpxchg.org: fix for CONFIG_SLOB builds] Link: https://lkml.kernel.org/r/YnwD14zxYjUJPc2w@cmpxchg.org [hannes@cmpxchg.org: opt out of cgroups v1] Link: https://lkml.kernel.org/r/Yn6it9mBYFA+/lTb@cmpxchg.org Link: https://lkml.kernel.org/r/20220510152847.230957-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Seth Jennings Cc: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8ea4b541c31e..9ecead1042b9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,6 +35,8 @@ enum memcg_stat_item { MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, + MEMCG_ZSWAP_B, + MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; @@ -252,6 +254,10 @@ struct mem_cgroup { /* Range enforcement for interrupt charges */ struct work_struct high_work; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + unsigned long zswap_max; +#endif + unsigned long soft_limit; /* vmpressure notifications */ @@ -1273,6 +1279,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) return NULL; } +static inline void obj_cgroup_put(struct obj_cgroup *objcg) +{ +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } @@ -1694,6 +1704,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); +struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1730,6 +1741,20 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_obj(void *p); +static inline void count_objcg_event(struct obj_cgroup *objcg, + enum vm_event_item idx) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_kmem_disabled()) + return; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + count_memcg_events(memcg, idx, 1); + rcu_read_unlock(); +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1756,6 +1781,11 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } +static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline bool memcg_kmem_enabled(void) { return false; @@ -1771,6 +1801,30 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } +static inline void count_objcg_event(struct obj_cgroup *objcg, + enum vm_event_item idx) +{ +} + #endif /* CONFIG_MEMCG_KMEM */ +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); +void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); +void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +#else +static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) +{ + return true; +} +static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, + size_t size) +{ +} +static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, + size_t size) +{ +} +#endif + #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 6d4675e601357834dadd2ba1d803f6484596015c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 19 May 2022 14:08:54 -0700 Subject: mm: don't be stuck to rmap lock on reclaim path The rmap locks(i_mmap_rwsem and anon_vma->root->rwsem) could be contended under memory pressure if processes keep working on their vmas(e.g., fork, mmap, munmap). It makes reclaim path stuck. In our real workload traces, we see kswapd is waiting the lock for 300ms+(worst case, a sec) and it makes other processes entering direct reclaim, which were also stuck on the lock. This patch makes lru aging path try_lock mode like shink_page_list so the reclaim context will keep working with next lru pages without being stuck. if it found the rmap lock contended, it rotates the page back to head of lru in both active/inactive lrus to make them consistent behavior, which is basic starting point rather than adding more heristic. Since this patch introduces a new "contended" field as out-param along with try_lock in-param in rmap_walk_control, it's not immutable any longer if the try_lock is set so remove const keywords on rmap related functions. Since rmap walking is already expensive operation, I doubt the const would help sizable benefit( And we didn't have it until 5.17). In a heavy app workload in Android, trace shows following statistics. It almost removes rmap lock contention from reclaim path. Martin Liu reported: Before: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function 1632 0 1631 151.542173 31672 209 page_lock_anon_vma_read 601 0 601 145.544681 28817 198 rmap_walk_file After: max_dur(ms) min_dur(ms) max-min(dur)ms avg_dur(ms) sum_dur(ms) count blocked_function NaN NaN NaN NaN NaN 0.0 NaN 0 0 0 0.127645 1 12 rmap_walk_file [minchan@kernel.org: add comment, per Matthew] Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: John Dias Cc: Tim Murray Cc: Matthew Wilcox Cc: Vladimir Davydov Cc: Martin Liu Cc: Minchan Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ include/linux/ksm.h | 4 ++-- include/linux/rmap.h | 28 ++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b81cacc51d2f..044b67f8d861 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -477,6 +477,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping) up_write(&mapping->i_mmap_rwsem); } +static inline int i_mmap_trylock_read(struct address_space *mapping) +{ + return down_read_trylock(&mapping->i_mmap_rwsem); +} + static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0630e545f4cb..0b4f17418f64 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -51,7 +51,7 @@ static inline void ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -79,7 +79,7 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, } static inline void rmap_walk_ksm(struct folio *folio, - const struct rmap_walk_control *rwc) + struct rmap_walk_control *rwc) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index cbe279a6f0de..9ec23138e410 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -128,6 +128,11 @@ static inline void anon_vma_lock_read(struct anon_vma *anon_vma) down_read(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); @@ -366,17 +371,14 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); -/* - * Called by memory-failure.c to kill processes. - */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() + * try_lock: bail out if the rmap lock is contended + * contended: indicate the rmap traversal bailed out due to lock contention * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default @@ -384,6 +386,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; + bool try_lock; + bool contended; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. @@ -391,12 +395,20 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio); + struct anon_vma *(*anon_lock)(struct folio *folio, + struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc); -void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc); +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); + +/* + * Called by memory-failure.c to kill processes. + */ +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ -- cgit v1.2.3 From 3f913fc5f9745613088d3c569778c9813ab9c129 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 May 2022 14:08:55 -0700 Subject: mm: fix missing handler for __GFP_NOWARN We expect no warnings to be issued when we specify __GFP_NOWARN, but currently in paths like alloc_pages() and kmalloc(), there are still some warnings printed, fix it. But for some warnings that report usage problems, we don't deal with them. If such warnings are printed, then we should fix the usage problems. Such as the following case: WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); [zhengqi.arch@bytedance.com: v2] Link: https://lkml.kernel.org/r/20220511061951.1114-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220510113809.80626-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Akinobu Mita Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 2d04f6448cde..9f6e25467844 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -20,6 +20,7 @@ struct fault_attr { atomic_t space; unsigned long verbose; bool task_filter; + bool no_warn; unsigned long stacktrace_depth; unsigned long require_start; unsigned long require_end; @@ -39,6 +40,7 @@ struct fault_attr { .ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \ .verbose = 2, \ .dname = NULL, \ + .no_warn = false, \ } #define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER -- cgit v1.2.3 From 37462a920392cb86541650a6f4121155f11f1199 Mon Sep 17 00:00:00 2001 From: Christophe de Dinechin Date: Thu, 14 Apr 2022 17:08:54 +0200 Subject: nodemask.h: fix compilation error with GCC12 With gcc version 12.0.1 20220401 (Red Hat 12.0.1-0), building with defconfig results in the following compilation error: | CC mm/swapfile.o | mm/swapfile.c: In function `setup_swap_info': | mm/swapfile.c:2291:47: error: array subscript -1 is below array bounds | of `struct plist_node[]' [-Werror=array-bounds] | 2291 | p->avail_lists[i].prio = 1; | | ~~~~~~~~~~~~~~^~~ | In file included from mm/swapfile.c:16: | ./include/linux/swap.h:292:27: note: while referencing `avail_lists' | 292 | struct plist_node avail_lists[]; /* | | ^~~~~~~~~~~ This is due to the compiler detecting that the mask in node_states[__state] could theoretically be zero, which would lead to first_node() returning -1 through find_first_bit. I believe that the warning/error is legitimate. I first tried adding a test to check that the node mask is not emtpy, since a similar test exists in the case where MAX_NUMNODES == 1. However, adding the if statement causes other warnings to appear in for_each_cpu_node_but, because it introduces a dangling else ambiguity. And unfortunately, GCC is not smart enough to detect that the added test makes the case where (node) == -1 impossible, so it still complains with the same message. This is why I settled on replacing that with a harmless, but relatively useless (node) >= 0 test. Based on the warning for the dangling else, I also decided to fix the case where MAX_NUMNODES == 1 by moving the condition inside the for loop. It will still only be tested once. This ensures that the meaning of an else following for_each_node_mask or derivatives would not silently have a different meaning depending on the configuration. Link: https://lkml.kernel.org/r/20220414150855.2407137-3-dinechin@redhat.com Signed-off-by: Christophe de Dinechin Signed-off-by: Christophe de Dinechin Reviewed-by: Andrew Morton Cc: Ben Segall Cc: "Michael S. Tsirkin" Cc: Steven Rostedt Cc: Ingo Molnar Cc: Mel Gorman Cc: Dietmar Eggemann Cc: Vincent Guittot Cc: Paolo Bonzini Cc: Daniel Bristot de Oliveira Cc: Jason Wang Cc: Zhen Lei Cc: Juri Lelli Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton --- include/linux/nodemask.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 567c3ddba2c4..c6199dbe2591 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -375,14 +375,13 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, } #if MAX_NUMNODES > 1 -#define for_each_node_mask(node, mask) \ - for ((node) = first_node(mask); \ - (node) < MAX_NUMNODES; \ - (node) = next_node((node), (mask))) +#define for_each_node_mask(node, mask) \ + for ((node) = first_node(mask); \ + (node >= 0) && (node) < MAX_NUMNODES; \ + (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ -#define for_each_node_mask(node, mask) \ - if (!nodes_empty(mask)) \ - for ((node) = 0; (node) < 1; (node)++) +#define for_each_node_mask(node, mask) \ + for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* -- cgit v1.2.3 From 991d8d8142cad94f9c5c05db25e67fa83d6f772a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 May 2022 11:34:33 +0200 Subject: topology: Remove unused cpu_cluster_mask() default_topology[] uses cpu_clustergroup_mask() for the CLS level (guarded by CONFIG_SCHED_CLUSTER) which is currently provided by x86 (arch/x86/kernel/smpboot.c) and arm64 (drivers/base/arch_topology.c). Fixes: 778c558f49a2c ("sched: Add cluster scheduler level in core and related Kconfig for ARM64") Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Barry Song Link: https://lore.kernel.org/r/20220513093433.425163-1-dietmar.eggemann@arm.com --- include/linux/topology.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index f19bc3626297..4564faafd0e1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,13 +240,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif -#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) -static inline const struct cpumask *cpu_cluster_mask(int cpu) -{ - return topology_cluster_cpumask(cpu); -} -#endif - static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); -- cgit v1.2.3 From 5cebb40bc9554aafcc492431181f43c6231b0459 Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Wed, 18 May 2022 22:37:56 +0530 Subject: net: macb: Fix PTP one step sync support PTP one step sync packets cannot have CSUM padding and insertion in SW since time stamp is inserted on the fly by HW. In addition, ptp4l version 3.0 and above report an error when skb timestamps are reported for packets that not processed for TX TS after transmission. Add a helper to identify PTP one step sync and fix the above two errors. Add a common mask for PTP header flag field "twoStepflag". Also reset ptp OSS bit when one step is not selected. Fixes: ab91f0a9b5f4 ("net: macb: Add hardware PTP support") Fixes: 653e92a9175e ("net: macb: add support for padding and fcs computation") Signed-off-by: Harini Katakam Reviewed-by: Radhey Shyam Pandey Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220518170756.7752-1-harini.katakam@xilinx.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_classify.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index fefa7790dc46..2b6ea36ad162 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -43,6 +43,9 @@ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ #define OFF_PTP_SEQUENCE_ID 30 +/* PTP header flag fields */ +#define PTP_FLAG_TWOSTEP BIT(1) + /* Below defines should actually be removed at some point in time. */ #define IP6_HLEN 40 #define UDP_HLEN 8 -- cgit v1.2.3 From 827fc630e4c8087df5a8e8ee013b686bd6f13736 Mon Sep 17 00:00:00 2001 From: Muneendra Kumar Date: Thu, 19 May 2022 05:31:07 -0700 Subject: scsi: nvme-fc: Add new routine nvme_fc_io_getuuid() Add nvme_fc_io_getuuid() to the nvme-fc transport. The routine is invoked by the FC LLDD on a per-I/O request basis. The routine translates from the FC-specific request structure to the bio and the cgroup structure in order to obtain the FC appid stored in the cgroup structure. If a value is not set or a bio is not found, a NULL appid (aka uuid) will be returned to the LLDD. Link: https://lore.kernel.org/r/20220519123110.17361-2-jsmart2021@gmail.com Reviewed-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Acked-by: Christoph Hellwig Signed-off-by: Muneendra Kumar Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- include/linux/nvme-fc-driver.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 5358a5facdee..fa092b9be2fd 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -564,6 +564,15 @@ int nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *remoteport, void *lsreqbuf, u32 lsreqbuf_len); +/* + * Routine called to get the appid field associated with request by the lldd + * + * If the return value is NULL : the user/libvirt has not set the appid to VM + * If the return value is non-zero: Returns the appid associated with VM + * + * @req: IO request from nvme fc to driver + */ +char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req); /* * *************** LLDD FC-NVME Target/Subsystem API *************** @@ -1048,5 +1057,10 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); +/* + * add a define, visible to the compiler, that indicates support + * for feature. Allows for conditional compilation in LLDDs. + */ +#define NVME_FC_FEAT_UUID 0x0001 #endif /* _NVME_FC_DRIVER_H */ -- cgit v1.2.3 From 59df85d5fbae17175c391d89ad03e9e7a01b7a55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Mar 2022 19:56:53 -0500 Subject: linux/mount.h: trim includes Signed-off-by: Al Viro --- include/linux/mount.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 7f18a7555dff..b3b149dcbf96 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -11,17 +11,15 @@ #define _LINUX_MOUNT_H #include -#include -#include -#include -#include -#include +#include struct super_block; -struct vfsmount; struct dentry; -struct mnt_namespace; +struct user_namespace; +struct file_system_type; struct fs_context; +struct file; +struct path; #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 @@ -81,9 +79,6 @@ static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) return smp_load_acquire(&mnt->mnt_userns); } -struct file; /* forward dec */ -struct path; - extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern void mnt_drop_write(struct vfsmount *mnt); @@ -94,12 +89,10 @@ extern struct vfsmount *mnt_clone_internal(const struct path *path); extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); -struct path; extern struct vfsmount *clone_private_mount(const struct path *path); extern int __mnt_want_write(struct vfsmount *); extern void __mnt_drop_write(struct vfsmount *); -struct file_system_type; extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, -- cgit v1.2.3 From 70f8d9c5750bbb0ca4ef7e23d6abcb05e6061138 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Mar 2022 17:49:09 -0500 Subject: move mount-related externs from fs.h to mount.h Signed-off-by: Al Viro --- include/linux/fs.h | 11 ----------- include/linux/mount.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index bbde95387a23..14df7c049b43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2461,22 +2461,11 @@ struct super_block *sget(struct file_system_type *type, extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); -extern struct vfsmount *kern_mount(struct file_system_type *); -extern void kern_unmount(struct vfsmount *mnt); -extern int may_umount_tree(struct vfsmount *); -extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, - const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(const struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); -extern bool our_mnt(struct vfsmount *mnt); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); diff --git a/include/linux/mount.h b/include/linux/mount.h index b3b149dcbf96..55a4abaf6715 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -108,6 +108,18 @@ extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); extern bool path_is_mountpoint(const struct path *path); +extern bool our_mnt(struct vfsmount *mnt); + +extern struct vfsmount *kern_mount(struct file_system_type *); +extern void kern_unmount(struct vfsmount *mnt); +extern int may_umount_tree(struct vfsmount *); +extern int may_umount(struct vfsmount *); +extern long do_mount(const char *, const char __user *, + const char *, unsigned long, void *); +extern struct vfsmount *collect_mounts(const struct path *); +extern void drop_collected_mounts(struct vfsmount *); +extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, + struct vfsmount *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); #endif /* _LINUX_MOUNT_H */ -- cgit v1.2.3 From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 19 May 2022 16:30:10 -0700 Subject: bpf: Add bpf_skc_to_mptcp_sock_proto This patch implements a new struct bpf_func_proto, named bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP, and a new helper bpf_skc_to_mptcp_sock(), which invokes another new helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct mptcp_sock from a given subflow socket. v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c, remove build check for CONFIG_BPF_JIT v5: Drop EXPORT_SYMBOL (Martin) Co-developed-by: Nicolas Rybowski Co-developed-by: Matthieu Baerts Signed-off-by: Nicolas Rybowski Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com --- include/linux/bpf.h | 1 + include/linux/btf_ids.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c107392b0ba7..a3ef078401cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2231,6 +2231,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bc5d9cc34e4c..335a19092368 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -178,7 +178,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_MPTCP, mptcp_sock) enum { #define BTF_SOCK_TYPE(name, str) name, -- cgit v1.2.3 From b23316aabffa835ecc516cb81daeef5b9155e8a5 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 19 May 2022 23:06:10 +0800 Subject: selftests/bpf: Add missing trampoline program type to trampoline_count test Currently the trampoline_count test doesn't include any fmod_ret bpf programs, fix it to make the test cover all possible trampoline program types. Since fmod_ret bpf programs can't be attached to __set_task_comm function, as it's neither whitelisted for error injection nor a security hook, change it to bpf_modify_return_test. This patch also does some other cleanups such as removing duplicate code, dropping inconsistent comments, etc. Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220519150610.601313-1-ytcoode@gmail.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a3ef078401cf..cc4d5e394031 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -724,7 +724,7 @@ struct btf_func_model { #define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 + * bytes on x86. */ #define BPF_MAX_TRAMP_LINKS 38 -- cgit v1.2.3 From bb12dd42d20f5513a8d1da225232af0a0743fd79 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 24 Sep 2021 12:56:53 +0200 Subject: powerpc/powermac: constify device_node in of_irq_parse_oldworld() The of_irq_parse_oldworld() does not modify passed device_node so make it a pointer to const for safety. Drop the extern while modifying the line. Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210924105653.46963-2-krzysztof.kozlowski@canonical.com --- include/linux/of_irq.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index aaf219bd0354..83fccd0c9bba 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -20,12 +20,12 @@ typedef int (*of_irq_init_cb_t)(struct device_node *, struct device_node *); #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) extern unsigned int of_irq_workarounds; extern struct device_node *of_irq_dflt_pic; -extern int of_irq_parse_oldworld(struct device_node *device, int index, - struct of_phandle_args *out_irq); +int of_irq_parse_oldworld(const struct device_node *device, int index, + struct of_phandle_args *out_irq); #else /* CONFIG_PPC32 && CONFIG_PPC_PMAC */ #define of_irq_workarounds (0) #define of_irq_dflt_pic (NULL) -static inline int of_irq_parse_oldworld(struct device_node *device, int index, +static inline int of_irq_parse_oldworld(const struct device_node *device, int index, struct of_phandle_args *out_irq) { return -EINVAL; -- cgit v1.2.3 From cd705ea857fdd859a9df09e8adda4cb4c906e8a2 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Fri, 1 Apr 2022 23:40:29 +0200 Subject: lib: add generic polynomial calculation Some temperature and voltage sensors use a polynomial to convert between raw data points and actual temperature or voltage. The polynomial is usually the result of a curve fitting of the diode characteristic. The BT1 PVT hwmon driver already uses such a polynonmial calculation which is rather generic. Move it to lib/ so other drivers can reuse it. Signed-off-by: Michael Walle Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220401214032.3738095-2-michael@walle.cc Signed-off-by: Guenter Roeck --- include/linux/polynomial.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/linux/polynomial.h (limited to 'include/linux') diff --git a/include/linux/polynomial.h b/include/linux/polynomial.h new file mode 100644 index 000000000000..9e074a0bb6fa --- /dev/null +++ b/include/linux/polynomial.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 BAIKAL ELECTRONICS, JSC + */ + +#ifndef _POLYNOMIAL_H +#define _POLYNOMIAL_H + +/* + * struct polynomial_term - one term descriptor of a polynomial + * @deg: degree of the term. + * @coef: multiplication factor of the term. + * @divider: distributed divider per each degree. + * @divider_leftover: divider leftover, which couldn't be redistributed. + */ +struct polynomial_term { + unsigned int deg; + long coef; + long divider; + long divider_leftover; +}; + +/* + * struct polynomial - a polynomial descriptor + * @total_divider: total data divider. + * @terms: polynomial terms, last term must have degree of 0 + */ +struct polynomial { + long total_divider; + struct polynomial_term terms[]; +}; + +long polynomial_calc(const struct polynomial *poly, long data); + +#endif -- cgit v1.2.3 From e5d21072054fbadf41cd56062a3a14e447e8c22b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 11 May 2022 06:29:59 -0700 Subject: hwmon: Introduce hwmon_device_register_for_thermal The thermal subsystem registers a hwmon driver without providing chip or sysfs group information. This is for legacy reasons and would be difficult to change. At the same time, we want to enforce that chip information is provided when registering a hwmon device using hwmon_device_register_with_info(). To enable this, introduce a special API for use only by the thermal subsystem. Acked-by: Rafael J . Wysocki Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 4efaf06fd2b8..14325f93c6b2 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -450,6 +450,9 @@ hwmon_device_register_with_info(struct device *dev, const struct hwmon_chip_info *info, const struct attribute_group **extra_groups); struct device * +hwmon_device_register_for_thermal(struct device *dev, const char *name, + void *drvdata); +struct device * devm_hwmon_device_register_with_info(struct device *dev, const char *name, void *drvdata, const struct hwmon_chip_info *info, -- cgit v1.2.3 From cc4e7fa549cb69c18bc2ba7209df373ca06749e3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:53 +0200 Subject: net: qed: fix typos in comments Spelling mistakes (triple letters) in comments. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- include/linux/qed/qed_fcoe_if.h | 4 ++-- include/linux/qed/qed_iscsi_if.h | 4 ++-- include/linux/qed/qed_nvmetcp_if.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_fcoe_if.h b/include/linux/qed/qed_fcoe_if.h index 16752eca5cbd..90e3045b2dcb 100644 --- a/include/linux/qed/qed_fcoe_if.h +++ b/include/linux/qed/qed_fcoe_if.h @@ -76,7 +76,7 @@ void qed_fcoe_set_pf_params(struct qed_dev *cdev, * @fill_dev_info: fills FCoE specific information * @param cdev * @param info - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @register_ops: register FCoE operations * @param cdev * @param ops - specified using qed_iscsi_cb_ops @@ -96,7 +96,7 @@ void qed_fcoe_set_pf_params(struct qed_dev *cdev, * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * return 0 on sucesss, otherwise error value. + * return 0 on success, otherwise error value. * @release_conn: release a previously acquired fcoe connection * @param cdev * @param handle - the connection handle. diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h index 494cdc3cd840..fbf7973ae9ba 100644 --- a/include/linux/qed/qed_iscsi_if.h +++ b/include/linux/qed/qed_iscsi_if.h @@ -133,7 +133,7 @@ struct qed_iscsi_cb_ops { * @fill_dev_info: fills iSCSI specific information * @param cdev * @param info - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @register_ops: register iscsi operations * @param cdev * @param ops - specified using qed_iscsi_cb_ops @@ -152,7 +152,7 @@ struct qed_iscsi_cb_ops { * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @release_conn: release a previously acquired iscsi connection * @param cdev * @param handle - the connection handle. diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h index 1d51df347560..bbfbfba51f37 100644 --- a/include/linux/qed/qed_nvmetcp_if.h +++ b/include/linux/qed/qed_nvmetcp_if.h @@ -132,7 +132,7 @@ struct nvmetcp_task_params { * connection. * @param p_doorbell - qed will fill the address of the * doorbell. - * @return 0 on sucesss, otherwise error value. + * @return 0 on success, otherwise error value. * @release_conn: release a previously acquired nvmetcp connection * @param cdev * @param handle - the connection handle. -- cgit v1.2.3 From ad25f5cb39872ca14bcbe00816ae65c22fe04b89 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 21 May 2022 08:45:28 +0100 Subject: rxrpc: Fix locking issue There's a locking issue with the per-netns list of calls in rxrpc. The pieces of code that add and remove a call from the list use write_lock() and the calls procfile uses read_lock() to access it. However, the timer callback function may trigger a removal by trying to queue a call for processing and finding that it's already queued - at which point it has a spare refcount that it has to do something with. Unfortunately, if it puts the call and this reduces the refcount to 0, the call will be removed from the list. Unfortunately, since the _bh variants of the locking functions aren't used, this can deadlock. ================================ WARNING: inconsistent lock state 5.18.0-rc3-build4+ #10 Not tainted -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. ksoftirqd/2/25 [HC0[0]:SC1[1]:HE1:SE0] takes: ffff888107ac4038 (&rxnet->call_lock){+.?.}-{2:2}, at: rxrpc_put_call+0x103/0x14b {SOFTIRQ-ON-W} state was registered at: ... Possible unsafe locking scenario: CPU0 ---- lock(&rxnet->call_lock); lock(&rxnet->call_lock); *** DEADLOCK *** 1 lock held by ksoftirqd/2/25: #0: ffff8881008ffdb0 ((&call->timer)){+.-.}-{0:0}, at: call_timer_fn+0x5/0x23d Changes ======= ver #2) - Changed to using list_next_rcu() rather than rcu_dereference() directly. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- include/linux/list.h | 10 ++++++++++ include/linux/seq_file.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index c147eeb2d39d..57e8b559cdf6 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -605,6 +605,16 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) +/** + * list_for_each_rcu - Iterate over a list in an RCU-safe fashion + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + !list_is_head(pos, (head)); \ + pos = rcu_dereference(pos->next)) + /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 60820ab511d2..bd023dd38ae6 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -277,6 +277,10 @@ extern struct list_head *seq_list_start_head(struct list_head *head, extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); +extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); +extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); + /* * Helpers for iteration over hlist_head-s in seq_files */ -- cgit v1.2.3 From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 May 2022 13:20:54 -0700 Subject: net: wrap the wireless pointers in struct net_device in an ifdef Most protocol-specific pointers in struct net_device are under a respective ifdef. Wireless is the notable exception. Since there's a sizable number of custom-built kernels for datacenter workloads which don't build wireless it seems reasonable to ifdefy those pointers as well. While at it move IPv4 and IPv6 pointers up, those are special for obvious reasons. Acked-by: Johannes Berg Acked-by: Stefan Schmidt # ieee802154 Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ce91cb8074a..f615a66c89e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2119,6 +2119,8 @@ struct net_device { /* Protocol-specific pointers */ + struct in_device __rcu *ip_ptr; + struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2131,16 +2133,18 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif - struct in_device __rcu *ip_ptr; #if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; #endif - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif +#if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; +#endif +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; +#endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif -- cgit v1.2.3 From 97d6fb1b48f5e6f6d58028593defe8a23641b0b4 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Tue, 12 Apr 2022 12:23:10 +0900 Subject: block: add sync_blockdev_range() sync_blockdev_range() is to support syncing multiple sectors with as few block device requests as possible, it is helpful to make the block device to give full play to its performance. Signed-off-by: Yuezhang Mo Suggested-by: Christoph Hellwig Reviewed-by: Andy Wu Reviewed-by: Aoyama Wataru Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Acked-by: Sungjong Seo Signed-off-by: Namjae Jeon --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d016138997..331cc6918ee9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1547,6 +1547,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void printk_all_partitions(void); -- cgit v1.2.3 From 100f59d964050020285f0c8264ce520f0c406c13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 6 May 2022 18:10:56 +0200 Subject: LSM: Remove double path_rename hook calls for RENAME_EXCHANGE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to be able to identify a file exchange with renameat2(2) and RENAME_EXCHANGE, which will be useful for Landlock [1], propagate the rename flags to LSMs. This may also improve performance because of the switch from two set of LSM hook calls to only one, and because LSMs using this hook may optimize the double check (e.g. only one lock, reduce the number of path walks). AppArmor, Landlock and Tomoyo are updated to leverage this change. This should not change the current behavior (same check order), except (different level of) speed boosts. [1] https://lore.kernel.org/r/20220221212522.320243-1-mic@digikod.net Cc: James Morris Cc: Kentaro Takeda Cc: Serge E. Hallyn Acked-by: John Johansen Acked-by: Tetsuo Handa Reviewed-by: Paul Moore Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20220506161102.525323-7-mic@digikod.net --- include/linux/lsm_hook_defs.h | 2 +- include/linux/lsm_hooks.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index db924fe379c9..eafa1d2489fd 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -100,7 +100,7 @@ LSM_HOOK(int, 0, path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) LSM_HOOK(int, 0, path_rename, const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, unsigned int flags) LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode) LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid) LSM_HOOK(int, 0, path_chroot, const struct path *path) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 419b5febc3ca..9acf5e368d73 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -358,6 +358,7 @@ * @old_dentry contains the dentry structure of the old link. * @new_dir contains the path structure for parent of the new link. * @new_dentry contains the dentry structure of the new link. + * @flags may contain rename options such as RENAME_EXCHANGE. * Return 0 if permission is granted. * @path_chmod: * Check for permission to change a mode of the file @path. The new -- cgit v1.2.3 From fb70bf124b051d4ded4ce57511dfec6d3ebf2b43 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 30 Mar 2022 10:30:54 -0400 Subject: NFSD: Instantiate a struct file when creating a regular NFSv4 file There have been reports of races that cause NFSv4 OPEN(CREATE) to return an error even though the requested file was created. NFSv4 does not provide a status code for this case. To mitigate some of these problems, reorganize the NFSv4 OPEN(CREATE) logic to allocate resources before the file is actually created, and open the new file while the parent directory is still locked. Two new APIs are added: + Add an API that works like nfsd_file_acquire() but does not open the underlying file. The OPEN(CREATE) path can use this API when it already has an open file. + Add an API that is kin to dentry_open(). NFSD needs to create a file and grab an open "struct file *" atomically. The alloc_empty_file() has to be done before the inode create. If it fails (for example, because the NFS server has exceeded its max_files limit), we avoid creating the file and can still return an error to the NFS client. BugLink: https://bugzilla.linux-nfs.org/show_bug.cgi?id=382 Signed-off-by: Chuck Lever Tested-by: JianHong Yin --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index aa6c1bbdb8c4..b848355b5e6c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2640,6 +2640,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, name, flags, mode); } extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *dentry_create(const struct path *path, int flags, + umode_t mode, const struct cred *cred); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); static inline struct file *file_clone_open(struct file *file) -- cgit v1.2.3 From bca1a1004615efe141fd78f360ecc48c60bc4ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Thu, 31 Mar 2022 09:01:15 +0200 Subject: mailbox: forward the hrtimer if not queued and under a lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c7dacf5b0f32957b24ef29df1207dc2cd8307743, "mailbox: avoid timer start from callback" The previous commit was reverted since it lead to a race that caused the hrtimer to not be started at all. The check for hrtimer_active() in msg_submit() will return true if the callback function txdone_hrtimer() is currently running. This function could return HRTIMER_NORESTART and then the timer will not be restarted, and also msg_submit() will not start the timer. This will lead to a message actually being submitted but no timer will start to check for its compleation. The original fix that added checking hrtimer_active() was added to avoid a warning with hrtimer_forward. Looking in the kernel another solution to avoid this warning is to check hrtimer_is_queued() before calling hrtimer_forward_now() instead. This however requires a lock so the timer is not started by msg_submit() inbetween this check and the hrtimer_forward() call. Fixes: c7dacf5b0f32 ("mailbox: avoid timer start from callback") Signed-off-by: Björn Ardö Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 36d6ce673503..6fee33cb52f5 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -83,6 +83,7 @@ struct mbox_controller { const struct of_phandle_args *sp); /* Internal to API */ struct hrtimer poll_hrt; + spinlock_t poll_hrt_lock; struct list_head node; }; -- cgit v1.2.3 From fe736565efb775620dbcf3c459c1cd80d3e868da Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:53 -0700 Subject: bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack Introduce bpf_arch_text_invalidate and use it to fill unused part of the bpf_prog_pack with illegal instructions when a BPF program is freed. Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator") Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]") Reported-by: Linus Torvalds Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220520235758.1858153-4-song@kernel.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc4d5e394031..a9b1875212f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2365,6 +2365,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); void *bpf_arch_text_copy(void *dst, void *src, size_t len); +int bpf_arch_text_invalidate(void *dst, size_t len); struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -- cgit v1.2.3 From 97e03f521050c092919591e668107b3d69c5f426 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:07 -0700 Subject: bpf: Add verifier support for dynptrs This patch adds the bulk of the verifier work for supporting dynamic pointers (dynptrs) in bpf. A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure defined internally as: struct bpf_dynptr_kern { void *data; u32 size; u32 offset; } __aligned(8); The upper 8 bits of *size* is reserved (it contains extra metadata about read-only status and dynptr type). Consequently, a dynptr only supports memory less than 16 MB. There are different types of dynptrs (eg malloc, ringbuf, ...). In this patchset, the most basic one, dynptrs to a bpf program's local memory, is added. For now only local memory that is of reg type PTR_TO_MAP_VALUE is supported. In the verifier, dynptr state information will be tracked in stack slots. When the program passes in an uninitialized dynptr (ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding to the frame pointer where the dynptr resides at are marked STACK_DYNPTR. For helper functions that take in initialized dynptrs (eg bpf_dynptr_read + bpf_dynptr_write which are added later in this patchset), the verifier enforces that the dynptr has been initialized properly by checking that their corresponding stack slots have been marked as STACK_DYNPTR. The 6th patch in this patchset adds test cases that the verifier should successfully reject, such as for example attempting to use a dynptr after doing a direct write into it inside the bpf program. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-2-joannelkoong@gmail.com --- include/linux/bpf.h | 28 ++++++++++++++++++++++++++++ include/linux/bpf_verifier.h | 18 ++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b1875212f6..b26c8176b9e0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,10 +392,15 @@ enum bpf_type_flag { MEM_UNINIT = BIT(7 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to memory local to the bpf program. */ + DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; +#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL + /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -438,6 +443,7 @@ enum bpf_arg_type { ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, /* Extended arg_types. */ @@ -2376,4 +2382,26 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 **bin_buf, u32 num_args); void bpf_bprintf_cleanup(void); +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, +}; + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1f1e7f2ea967..af5b2135215e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -72,6 +72,18 @@ struct bpf_reg_state { u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ + /* For dynptr stack slots */ + struct { + enum bpf_dynptr_type type; + /* A dynptr is 16 bytes so it takes up 2 stack slots. + * We need to track which slot is the first slot + * to protect against cases where the user may try to + * pass in an address starting at the second slot of the + * dynptr. + */ + bool first_slot; + } dynptr; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -174,9 +186,15 @@ enum bpf_stack_slot_type { STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ + /* A dynptr is stored in this stack slot. The type of dynptr + * is stored in bpf_stack_state->spilled_ptr.dynptr.type + */ + STACK_DYNPTR, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) +#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; -- cgit v1.2.3 From bc34dee65a65e9c920c420005b8a43f2a721a458 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:09 -0700 Subject: bpf: Dynptr support for ring buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, our only way of writing dynamically-sized data into a ring buffer is through bpf_ringbuf_output but this incurs an extra memcpy cost. bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra memcpy, but it can only safely support reservation sizes that are statically known since the verifier cannot guarantee that the bpf program won’t access memory outside the reserved space. The bpf_dynptr abstraction allows for dynamically-sized ring buffer reservations without the extra memcpy. There are 3 new APIs: long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr); void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags); void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags); These closely follow the functionalities of the original ringbuf APIs. For example, all ringbuffer dynptrs that have been reserved must be either submitted or discarded before the program exits. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20220523210712.3641569-4-joannelkoong@gmail.com --- include/linux/bpf.h | 15 ++++++++++++++- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b26c8176b9e0..c72321b6f306 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -395,11 +395,14 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to a ringbuf record. */ + DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK DYNPTR_TYPE_LOCAL +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -2231,6 +2234,9 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; @@ -2402,6 +2408,13 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, }; +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +int bpf_dynptr_check_size(u32 size); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index af5b2135215e..e8439f6cbe57 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -100,6 +100,8 @@ struct bpf_reg_state { * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. + * For stack slots that are dynptrs, this is used to track references to + * the dynptr to determine proper reference freeing. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned -- cgit v1.2.3 From 34d4ef5775f776ec4b0d53a02d588bf3195cada6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 May 2022 14:07:11 -0700 Subject: bpf: Add dynptr data slices This patch adds a new helper function void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len); which returns a pointer to the underlying data of a dynptr. *len* must be a statically known value. The bpf program may access the returned data slice as a normal buffer (eg can do direct reads and writes), since the verifier associates the length with the returned pointer, and enforces that no out of bounds accesses occur. Signed-off-by: Joanne Koong Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220523210712.3641569-6-joannelkoong@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c72321b6f306..a7080c86fa76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit v1.2.3 From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h livepatch.h remains on powerpc but its content is exclusively used by powerpc specific code. Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- include/linux/livepatch.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 -- cgit v1.2.3 From 7b4537199a4a8480b8c3ba37a2d44765ce76cd9b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 13 May 2022 20:39:22 +0900 Subject: kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS include/{linux,asm-generic}/export.h defines a weak symbol, __crc_* as a placeholder. Genksyms writes the version CRCs into the linker script, which will be used for filling the __crc_* symbols. The linker script format depends on CONFIG_MODULE_REL_CRCS. If it is enabled, __crc_* holds the offset to the reference of CRC. It is time to get rid of this complexity. Now that modpost parses text files (.*.cmd) to collect all the CRCs, it can generate C code that will be linked to the vmlinux or modules. Generate a new C file, .vmlinux.export.c, which contains the CRCs of symbols exported by vmlinux. It is compiled and linked to vmlinux in scripts/link-vmlinux.sh. Put the CRCs of symbols exported by modules into the existing *.mod.c files. No additional build step is needed for modules. As before, *.mod.c are compiled and linked to *.ko in scripts/Makefile.modfinal. No linker magic is used here. The new C implementation works in the same way, whether CONFIG_RELOCATABLE is enabled or not. CONFIG_MODULE_REL_CRCS is no longer needed. Previously, Kbuild invoked additional $(LD) to update the CRCs in objects, but this step is unneeded too. Signed-off-by: Masahiro Yamada Tested-by: Nathan Chancellor Tested-by: Nicolas Schier Reviewed-by: Nicolas Schier Tested-by: Sedat Dilek # LLVM-14 (x86-64) --- include/linux/export-internal.h | 17 +++++++++++++++++ include/linux/export.h | 30 ++++++++---------------------- 2 files changed, 25 insertions(+), 22 deletions(-) create mode 100644 include/linux/export-internal.h (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h new file mode 100644 index 000000000000..c2b1d4fd5987 --- /dev/null +++ b/include/linux/export-internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Please do not include this explicitly. + * This is used by C files generated by modpost. + */ + +#ifndef __LINUX_EXPORT_INTERNAL_H__ +#define __LINUX_EXPORT_INTERNAL_H__ + +#include +#include + +/* __used is needed to keep __crc_* for LTO */ +#define SYMBOL_CRC(sym, crc, sec) \ + u32 __section("___kcrctab" sec "+" #sym) __used __crc_##sym = crc + +#endif /* __LINUX_EXPORT_INTERNAL_H__ */ diff --git a/include/linux/export.h b/include/linux/export.h index 27d848712b90..565c5ffcb26f 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -11,6 +11,14 @@ * hackers place grumpy comments in header files. */ +/* + * This comment block is used by fixdep. Please do not remove. + * + * When CONFIG_MODVERSIONS is changed from n to y, all source files having + * EXPORT_SYMBOL variants must be re-compiled because genksyms is run as a + * side effect of the *.o build rule. + */ + #ifndef __ASSEMBLY__ #ifdef MODULE extern struct module __this_module; @@ -19,26 +27,6 @@ extern struct module __this_module; #define THIS_MODULE ((struct module *)0) #endif -#ifdef CONFIG_MODVERSIONS -/* Mark the CRC weak since genksyms apparently decides not to - * generate a checksums for some symbols */ -#if defined(CONFIG_MODULE_REL_CRCS) -#define __CRC_SYMBOL(sym, sec) \ - asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ - " .weak __crc_" #sym " \n" \ - " .long __crc_" #sym " - . \n" \ - " .previous \n") -#else -#define __CRC_SYMBOL(sym, sec) \ - asm(" .section \"___kcrctab" sec "+" #sym "\", \"a\" \n" \ - " .weak __crc_" #sym " \n" \ - " .long __crc_" #sym " \n" \ - " .previous \n") -#endif -#else -#define __CRC_SYMBOL(sym, sec) -#endif - #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS #include /* @@ -85,7 +73,6 @@ struct kernel_symbol { /* * For every exported symbol, do the following: * - * - If applicable, place a CRC entry in the __kcrctab section. * - Put the name of the symbol and namespace (empty string "" for none) in * __ksymtab_strings. * - Place a struct kernel_symbol entry in the __ksymtab section. @@ -98,7 +85,6 @@ struct kernel_symbol { extern typeof(sym) sym; \ extern const char __kstrtab_##sym[]; \ extern const char __kstrtabns_##sym[]; \ - __CRC_SYMBOL(sym, sec); \ asm(" .section \"__ksymtab_strings\",\"aMS\",%progbits,1 \n" \ "__kstrtab_" #sym ": \n" \ " .asciz \"" #sym "\" \n" \ -- cgit v1.2.3 From 421cfe6596f6cb316991c02bf30a93bd81092853 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 19 May 2022 14:33:11 -0400 Subject: vfio: remove VFIO_GROUP_NOTIFY_SET_KVM Rather than relying on a notifier for associating the KVM with the group, let's assume that the association has already been made prior to device_open. The first time a device is opened associate the group KVM with the device. This fixes a user-triggerable oops in GVT. Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Acked-by: Zhi Wang Link: https://lore.kernel.org/r/20220519183311.582380-2-mjrosato@linux.ibm.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 45b287826ce6..aa888cc51757 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -36,6 +36,8 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; + /* Driver must reference the kvm during open_device or never touch it */ + struct kvm *kvm; /* Members below here are private, not for driver use */ refcount_t refcount; @@ -155,15 +157,11 @@ extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, - VFIO_GROUP_NOTIFY = 1, }; /* events for VFIO_IOMMU_NOTIFY */ #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) -/* events for VFIO_GROUP_NOTIFY */ -#define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) - extern int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *required_events, -- cgit v1.2.3 From eadb2f47a3ced5c64b23b90fd2a3463f63726066 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 23 May 2022 19:11:02 +0100 Subject: lockdown: also lock down previous kgdb use KGDB and KDB allow read and write access to kernel memory, and thus should be restricted during lockdown. An attacker with access to a serial port (for example, via a hypervisor console, which some cloud vendors provide over the network) could trigger the debugger so it is important that the debugger respect the lockdown mode when/if it is triggered. Fix this by integrating lockdown into kdb's existing permissions mechanism. Unfortunately kgdb does not have any permissions mechanism (although it certainly could be added later) so, for now, kgdb is simply and brutally disabled by immediately exiting the gdb stub without taking any action. For lockdowns established early in the boot (e.g. the normal case) then this should be fine but on systems where kgdb has set breakpoints before the lockdown is enacted than "bad things" will happen. CVE: CVE-2022-21499 Co-developed-by: Stephen Brennan Signed-off-by: Stephen Brennan Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson Signed-off-by: Linus Torvalds --- include/linux/security.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 25b3ef71f495..7fc4e9f49f54 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -121,10 +121,12 @@ enum lockdown_reason { LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, LOCKDOWN_BPF_WRITE_USER, + LOCKDOWN_DBG_WRITE_KERNEL, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ_KERNEL, + LOCKDOWN_DBG_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, -- cgit v1.2.3 From 43994049180704fd1faf78623fabd9a5cd443708 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 4 May 2022 12:36:31 +0900 Subject: kprobes: Fix build errors with CONFIG_KRETPROBES=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Max Filippov reported: When building kernel with CONFIG_KRETPROBES=n kernel/kprobes.c compilation fails with the following messages: kernel/kprobes.c: In function ‘recycle_rp_inst’: kernel/kprobes.c:1273:32: error: implicit declaration of function ‘get_kretprobe’ kernel/kprobes.c: In function ‘kprobe_flush_task’: kernel/kprobes.c:1299:35: error: ‘struct task_struct’ has no member named ‘kretprobe_instances’ This came from the commit d741bf41d7c7 ("kprobes: Remove kretprobe hash") which introduced get_kretprobe() and kretprobe_instances member in task_struct when CONFIG_KRETPROBES=y, but did not make recycle_rp_inst() and kprobe_flush_task() depending on CONFIG_KRETPORBES. Since those functions are only used for kretprobe, move those functions into #ifdef CONFIG_KRETPROBE area. Link: https://lkml.kernel.org/r/165163539094.74407.3838114721073251225.stgit@devnote2 Reported-by: Max Filippov Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: "David S . Miller" Cc: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Tested-by: Max Filippov Signed-off-by: Steven Rostedt (Google) --- include/linux/kprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 157168769fc2..55041d2f884d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -424,7 +424,7 @@ void unregister_kretprobe(struct kretprobe *rp); int register_kretprobes(struct kretprobe **rps, int num); void unregister_kretprobes(struct kretprobe **rps, int num); -#ifdef CONFIG_KRETPROBE_ON_RETHOOK +#if defined(CONFIG_KRETPROBE_ON_RETHOOK) || !defined(CONFIG_KRETPROBES) #define kprobe_flush_task(tk) do {} while (0) #else void kprobe_flush_task(struct task_struct *tk); -- cgit v1.2.3 From 3a2bfec0b02f2226ff3376a5d2ff604d799bd7ea Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Wed, 18 May 2022 10:36:40 +0800 Subject: ftrace: Remove return value of ftrace_arch_modify_*() All instances of the function ftrace_arch_modify_prepare() and ftrace_arch_modify_post_process() return zero. There's no point in checking their return value. Just have them be void functions. Link: https://lkml.kernel.org/r/20220518023639.4065-1-kunyu@nfschina.com Signed-off-by: Li kunyu Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4816b7e11047..a5f74f6e7e4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -449,8 +449,8 @@ static inline void stack_tracer_enable(void) { } #ifdef CONFIG_DYNAMIC_FTRACE -int ftrace_arch_code_modify_prepare(void); -int ftrace_arch_code_modify_post_process(void); +void ftrace_arch_code_modify_prepare(void); +void ftrace_arch_code_modify_post_process(void); enum ftrace_bug_type { FTRACE_BUG_UNKNOWN, -- cgit v1.2.3 From 656d054e0a15ec327bd82801ccd58201e59f6896 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 May 2022 12:30:20 +0200 Subject: jump_label,noinstr: Avoid instrumentation for JUMP_LABEL=n builds When building x86_64 with JUMP_LABEL=n it's possible for instrumentation to sneak into noinstr: vmlinux.o: warning: objtool: exit_to_user_mode+0x14: call to static_key_count.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_exit_to_user_mode+0x2d: call to static_key_count.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_exit_to_user_mode+0x1b: call to static_key_count.constprop.0() leaves .noinstr.text section Switch to arch_ prefixed atomic to avoid the explicit instrumentation. Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) --- include/linux/jump_label.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 107751cc047b..bf1eef337a07 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -256,9 +256,9 @@ extern void static_key_disable_cpuslocked(struct static_key *key); #include #include -static inline int static_key_count(struct static_key *key) +static __always_inline int static_key_count(struct static_key *key) { - return atomic_read(&key->enabled); + return arch_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) -- cgit v1.2.3 From 620f8d3bd3d5e82dff8cc591c831827d4beeae2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 7 May 2022 13:35:37 +0200 Subject: context_tracking: Always inline empty stubs Because GCC is seriously challenged.. vmlinux.o: warning: objtool: enter_from_user_mode+0x85: call to context_tracking_enabled() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x8f: call to context_tracking_enabled() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x85: call to context_tracking_enabled() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x85: call to context_tracking_enabled() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Link: https://lkml.kernel.org/r/20220526105958.134113388@infradead.org --- include/linux/context_tracking_state.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 65a60d3313b0..ae1e63e26947 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -46,10 +46,10 @@ static __always_inline bool context_tracking_in_user(void) return __this_cpu_read(context_tracking.state) == CONTEXT_USER; } #else -static inline bool context_tracking_in_user(void) { return false; } -static inline bool context_tracking_enabled(void) { return false; } -static inline bool context_tracking_enabled_cpu(int cpu) { return false; } -static inline bool context_tracking_enabled_this_cpu(void) { return false; } +static __always_inline bool context_tracking_in_user(void) { return false; } +static __always_inline bool context_tracking_enabled(void) { return false; } +static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; } +static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; } #endif /* CONFIG_CONTEXT_TRACKING */ #endif -- cgit v1.2.3 From b9684a71fca793213378dd410cd11675d973eaa1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 May 2022 07:58:06 +0200 Subject: block, loop: support partitions without scanning Historically we did distinguish between a flag that surpressed partition scanning, and a combinations of the minors variable and another flag if any partitions were supported. This was generally confusing and doesn't make much sense, but some corner case uses of the loop driver actually do want to support manually added partitions on a device that does not actively scan for partitions. To make things worsee the loop driver also wants to dynamically toggle the scanning for partitions on a live gendisk, which makes the disk->flags updates non-atomic. Introduce a new GD_SUPPRESS_PART_SCAN bit in disk->state that disables just scanning for partitions, and toggle that instead of GENHD_FL_NO_PART in the loop driver. Fixes: 1ebe2e5f9d68 ("block: remove GENHD_FL_EXT_DEVT") Reported-by: Ming Lei Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220527055806.1972352-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index afad3d1d0dac..691b4c15b8ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -147,6 +147,7 @@ struct gendisk { #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 #define GD_ADDED 4 +#define GD_SUPPRESS_PART_SCAN 5 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 3e35142ef99fe6b4fe5d834ad43ee13cca10a2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 19 May 2022 14:42:37 +0530 Subject: kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add] Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") [1], binutils (v2.36+) started dropping section symbols that it thought were unused. This isn't an issue in general, but with kexec_file.c, gcc is placing kexec_arch_apply_relocations[_add] into a separate .text.unlikely section and the section symbol ".text.unlikely" is being dropped. Due to this, recordmcount is unable to find a non-weak symbol in .text.unlikely to generate a relocation record against. Address this by dropping the weak attribute from these functions. Instead, follow the existing pattern of having architectures #define the name of the function they want to override in their headers. [1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1 [akpm@linux-foundation.org: arch/s390/include/asm/kexec.h needs linux/module.h] Link: https://lkml.kernel.org/r/20220519091237.676736-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- include/linux/kexec.h | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 58d1b58a971e..fcd5035209f1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -193,14 +193,6 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len); void *arch_kexec_kernel_image_load(struct kimage *image); -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); -int arch_kexec_apply_relocations(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); int arch_kimage_file_post_load_cleanup(struct kimage *image); #ifdef CONFIG_KEXEC_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, @@ -229,6 +221,44 @@ extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mend); extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, void **addr, unsigned long *sz); + +#ifndef arch_kexec_apply_relocations_add +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} +#endif + +#ifndef arch_kexec_apply_relocations +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} +#endif #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF -- cgit v1.2.3 From 9f186f9e5fa9ebdaef909fd45f825a6ce281f13c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 19 May 2022 20:50:26 +0800 Subject: mm/swapfile: unuse_pte can map random data if swap read fails Patch series "A few fixup patches for mm", v4. This series contains a few patches to avoid mapping random data if swap read fails and fix lost swap bits in unuse_pte. Also we free hwpoison and swapin error entry in madvise_free_pte_range and so on. More details can be found in the respective changelogs. This patch (of 5): There is a bug in unuse_pte(): when swap page happens to be unreadable, page filled with random data is mapped into user address space. In case of error, a special swap entry indicating swap read fails is set to the page table. So the swapcache page can be freed and the user won't end up with a permanently mounted swap because a sector is bad. And if the page is accessed later, the user process will be killed so that corrupted data is never consumed. On the other hand, if the page is never accessed, the user won't even notice it. Link: https://lkml.kernel.org/r/20220519125030.21486-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220519125030.21486-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: David Howells Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Cc: Ralph Campbell Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++++++- include/linux/swapops.h | 10 ++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f3ae17b43f20..0c0fed1b348f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,10 @@ static inline int current_is_kswapd(void) * actions on faults. */ +#define SWP_SWAPIN_ERROR_NUM 1 +#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ + SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that are mapped with * file-backed memories. As its name "PTE" hints, it should only be applied to @@ -120,7 +124,8 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM) + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ + SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index fe220df499f1..f24775b41880 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } +static inline swp_entry_t make_swapin_error_entry(struct page *page) +{ + return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_SWAPIN_ERROR; +} + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { -- cgit v1.2.3 From 1c563432588dbffa71e67ca6e37c826f9fa86e04 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 24 May 2022 10:15:25 -0700 Subject: mm: fix is_pinnable_page against a cma page Pages in the CMA area could have MIGRATE_ISOLATE as well as MIGRATE_CMA so the current is_pinnable_page() could miss CMA pages which have MIGRATE_ISOLATE. It ends up pinning CMA pages as longterm for the pin_user_pages() API so CMA allocations keep failing until the pin is released. CPU 0 CPU 1 - Task B cma_alloc alloc_contig_range pin_user_pages_fast(FOLL_LONGTERM) change pageblock as MIGRATE_ISOLATE internal_get_user_pages_fast lockless_pages_from_mm gup_pte_range try_grab_folio is_pinnable_page return true; So, pinned the page successfully. page migration failure with pinned page .. .. After 30 sec unpin_user_page(page) CMA allocation succeeded after 30 sec. The CMA allocation path protects the migration type change race using zone->lock but what GUP path need to know is just whether the page is on CMA area or not rather than exact migration type. Thus, we don't need zone->lock but just checks migration type in either of (MIGRATE_ISOLATE and MIGRATE_CMA). Adding the MIGRATE_ISOLATE check in is_pinnable_page could cause rejecting of pinning pages on MIGRATE_ISOLATE pageblocks even though it's neither CMA nor movable zone if the page is temporarily unmovable. However, such a migration failure by unexpected temporal refcount holding is general issue, not only come from MIGRATE_ISOLATE and the MIGRATE_ISOLATE is also transient state like other temporal elevated refcount problem. Link: https://lkml.kernel.org/r/20220524171525.976723-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Acked-by: Paul E. McKenney Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index de32c0383387..93a46ff33dc2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || - is_zero_pfn(page_to_pfn(page)); +#ifdef CONFIG_CMA + int mt = get_pageblock_migratetype(page); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_pinnable_page(struct page *page) -- cgit v1.2.3 From 98d40e76652e9aeb3aec4065f600d633ed335e94 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 24 May 2022 07:56:30 +0200 Subject: block: document BLK_STS_AGAIN usage BLK_STS_AGAIN should only be used if RQF_NOWAIT is set and the bio would block. So we'd better document that to avoid accidental misuse. Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220524055631.85480-2-hare@suse.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 40e815400611..8b38367d1bc7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -105,6 +105,10 @@ typedef u16 blk_short_t; /* hack for device mapper, don't use elsewhere: */ #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) +/* + * BLK_STS_AGAIN should only be returned if RQF_NOWAIT is set + * and the bio would block (cf bio_wouldblock_error()) + */ #define BLK_STS_AGAIN ((__force blk_status_t)12) /* -- cgit v1.2.3 From e2e530867245d051dc7800b0d07193b3e581f5b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 May 2022 14:15:30 +0200 Subject: blk-mq: remove the done argument to blk_execute_rq_nowait Let the caller set it together with the end_io_data instead of passing a pointless argument. Note the the target code did in fact already set it and then just overrode it again by calling blk_execute_rq_nowait. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220524121530.943123-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9f07061418db..e2d9daf7e8dd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -969,8 +969,7 @@ int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); -void blk_execute_rq_nowait(struct request *rq, bool at_head, - rq_end_io_fn *end_io); +void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); struct req_iterator { -- cgit v1.2.3 From 3e0b8f529c10037ae0b369fc892e524eae5a5485 Mon Sep 17 00:00:00 2001 From: Arun Ajith S Date: Mon, 30 May 2022 10:14:14 +0000 Subject: net/ipv6: Expand and rename accept_unsolicited_na to accept_untracked_na RFC 9131 changes default behaviour of handling RX of NA messages when the corresponding entry is absent in the neighbour cache. The current implementation is limited to accept just unsolicited NAs. However, the RFC is more generic where it also accepts solicited NAs. Both types should result in adding a STALE entry for this case. Expand accept_untracked_na behaviour to also accept solicited NAs to be compliant with the RFC and rename the sysctl knob to accept_untracked_na. Fixes: f9a2fb73318e ("net/ipv6: Introduce accept_unsolicited_na knob to implement router-side changes for RFC9131") Signed-off-by: Arun Ajith S Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220530101414.65439-1-aajith@arista.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 38c8203d52cb..37dfdcfcdd54 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -61,7 +61,7 @@ struct ipv6_devconf { __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; - __s32 accept_unsolicited_na; + __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; -- cgit v1.2.3 From 13b00b135665c92065a27c0c39dd97e0f380bd4f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:00 +0300 Subject: vdpa: Add support for querying vendor statistics Allows to read vendor statistics of a vdpa device. The specific statistics data are received from the upstream driver in the form of an (attribute name, attribute value) pairs. An example of statistics for mlx5_vdpa device are: received_desc - number of descriptors received by the virtqueue completed_desc - number of descriptors completed by the virtqueue A descriptor using indirect buffers is still counted as 1. In addition, N chained descriptors are counted correctly N times as one would expect. A new callback was added to vdpa_config_ops which provides the means for the vdpa driver to return statistics results. The interface allows for reading all the supported virtqueues, including the control virtqueue if it exists. Below are some examples taken from mlx5_vdpa which are introduced in the following patch: 1. Read statistics for the virtqueue at index 1 $ vdpa dev vstats show vdpa-a qidx 1 vdpa-a: queue_type tx queue_index 1 received_desc 3844836 completed_desc 3844836 2. Read statistics for the virtqueue at index 32 $ vdpa dev vstats show vdpa-a qidx 32 vdpa-a: queue_type control_vq queue_index 32 received_desc 62 completed_desc 62 3. Read statisitics for the virtqueue at index 0 with json output $ vdpa -j dev vstats show vdpa-a qidx 0 {"vstats":{"vdpa-a":{ "queue_type":"rx","queue_index":0,"name":"received_desc","value":417776,\ "name":"completed_desc","value":417548}}} 4. Read statistics for the virtqueue at index 0 with preety json output $ vdpa -jp dev vstats show vdpa-a qidx 0 { "vstats": { "vdpa-a": { "queue_type": "rx", "queue_index": 0, "name": "received_desc", "value": 417776, "name": "completed_desc", "value": 417548 } } } Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-3-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8943a209202e..2ae8443331e1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -276,6 +276,9 @@ struct vdpa_config_ops { const struct vdpa_vq_state *state); int (*get_vq_state)(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state); + int (*get_vendor_vq_stats)(struct vdpa_device *vdev, u16 idx, + struct sk_buff *msg, + struct netlink_ext_ack *extack); struct vdpa_notification_area (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* vq irq is not expected to be changed once DRIVER_OK is set */ -- cgit v1.2.3 From a6a51adc6e8aafebfe0c4beb80e99694ea562b40 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:02 +0300 Subject: net/vdpa: Use readers/writers semaphore instead of cf_mutex Replace cf_mutex with rw_semaphore to reflect the fact that some calls could be called concurrently but can suffice with read lock. Suggested-by: Si-Wei Liu Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-5-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2ae8443331e1..2cb14847831e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -66,7 +66,7 @@ struct vdpa_mgmt_dev; * @dma_dev: the actual device that is performing DMA * @driver_override: driver name to force a match * @config: the configuration ops for this device. - * @cf_mutex: Protects get and set access to configuration layout. + * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests * @use_va: indicate whether virtual address must be used by this device @@ -79,7 +79,7 @@ struct vdpa_device { struct device *dma_dev; const char *driver_override; const struct vdpa_config_ops *config; - struct mutex cf_mutex; /* Protects get/set config */ + struct rw_semaphore cf_lock; /* Protects get/set config */ unsigned int index; bool features_valid; bool use_va; @@ -398,10 +398,10 @@ static inline int vdpa_reset(struct vdpa_device *vdev) const struct vdpa_config_ops *ops = vdev->config; int ret; - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); vdev->features_valid = false; ret = ops->reset(vdev); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); return ret; } @@ -420,9 +420,9 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) { int ret; - mutex_lock(&vdev->cf_mutex); + down_write(&vdev->cf_lock); ret = vdpa_set_features_unlocked(vdev, features); - mutex_unlock(&vdev->cf_mutex); + up_write(&vdev->cf_lock); return ret; } -- cgit v1.2.3 From 1892a3d425bf525ac98d6d3534035e6ed2bfab50 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 18 May 2022 16:38:03 +0300 Subject: vdpa/mlx5: Add support for reading descriptor statistics Implement the get_vq_stats calback of vdpa_config_ops to return the statistics for a virtqueue. The statistics are provided as vendor specific statistics where the driver provides a pair of attribute name and attribute value. Currently supported are received descriptors and completed descriptors. Signed-off-by: Eli Cohen Message-Id: <20220518133804.1075129-6-elic@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/mlx5_ifc_vdpa.h | 39 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 78b3d3465dd7..2a8334bb5f82 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -87,6 +87,7 @@ enum { enum { MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b, MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, + MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 1a9c9d94cb59..4414ed5b6ed2 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -165,4 +165,43 @@ struct mlx5_ifc_modify_virtio_net_q_out_bits { struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; }; +struct mlx5_ifc_virtio_q_counters_bits { + u8 modify_field_select[0x40]; + u8 reserved_at_40[0x40]; + u8 received_desc[0x40]; + u8 completed_desc[0x40]; + u8 error_cqes[0x20]; + u8 bad_desc_errors[0x20]; + u8 exceed_max_chain[0x20]; + u8 invalid_buffer[0x20]; + u8 reserved_at_180[0x280]; +}; + +struct mlx5_ifc_create_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits virtio_q_counters; +}; + +struct mlx5_ifc_create_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits virtio_q_counters; +}; + +struct mlx5_ifc_destroy_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_destroy_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_query_virtio_q_counters_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; +}; + +struct mlx5_ifc_query_virtio_q_counters_out_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_virtio_q_counters_bits counters; +}; + #endif /* __MLX5_IFC_VDPA_H_ */ -- cgit v1.2.3 From d4821902e43453b85b31329441a9f6ac071228a8 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:45 +0530 Subject: vdpa: introduce virtqueue groups This patch introduces virtqueue groups to vDPA device. The virtqueue group is the minimal set of virtqueues that must share an address space. And the address space identifier could only be attached to a specific virtqueue group. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-6-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2cb14847831e..e4e53574183e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -85,6 +85,7 @@ struct vdpa_device { bool use_va; u32 nvqs; struct vdpa_mgmt_dev *mdev; + unsigned int ngroups; }; /** @@ -172,6 +173,10 @@ struct vdpa_map_file { * for the device * @vdev: vdpa device * Returns virtqueue algin requirement + * @get_vq_group: Get the group id for a specific virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * Returns u32: group id for this virtqueue * @get_device_features: Get virtio features supported by the device * @vdev: vdpa device * Returns the virtio features support by the @@ -286,6 +291,7 @@ struct vdpa_config_ops { /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); + u32 (*get_vq_group)(struct vdpa_device *vdev, u16 idx); u64 (*get_device_features)(struct vdpa_device *vdev); int (*set_driver_features)(struct vdpa_device *vdev, u64 features); u64 (*get_driver_features)(struct vdpa_device *vdev); @@ -318,6 +324,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + unsigned int ngroups, size_t size, const char *name, bool use_va); @@ -328,17 +335,18 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @member: the name of struct vdpa_device within the @dev_struct * @parent: the parent device * @config: the bus operations that is supported by this device + * @ngroups: the number of virtqueue groups supported by this device * @name: name of the vdpa device * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va) \ - container_of(__vdpa_alloc_device( \ - parent, config, \ +#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, name, use_va) \ + container_of((__vdpa_alloc_device( \ + parent, config, ngroups, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name, use_va), \ + dev_struct, member)), name, use_va)), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs); -- cgit v1.2.3 From db9adcbf4286ad1c1fca091a870db6e49bb0df07 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:46 +0530 Subject: vdpa: multiple address spaces support This patches introduces the multiple address spaces support for vDPA device. This idea is to identify a specific address space via an dedicated identifier - ASID. During vDPA device allocation, vDPA device driver needs to report the number of address spaces supported by the device then the DMA mapping ops of the vDPA device needs to be extended to support ASID. This helps to isolate the environments for the virtqueue that will not be assigned directly. E.g in the case of virtio-net, the control virtqueue will not be assigned directly to guest. As a start, simply claim 1 virtqueue groups and 1 address spaces for all vDPA devices. And vhost-vDPA will simply reject the device with more than 1 virtqueue groups or address spaces. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-7-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index e4e53574183e..1515748e84ff 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -69,6 +69,8 @@ struct vdpa_mgmt_dev; * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests + * @ngroups: the number of virtqueue groups + * @nas: the number of address spaces * @use_va: indicate whether virtual address must be used by this device * @nvqs: maximum number of supported virtqueues * @mdev: management device pointer; caller must setup when registering device as part @@ -86,6 +88,7 @@ struct vdpa_device { u32 nvqs; struct vdpa_mgmt_dev *mdev; unsigned int ngroups; + unsigned int nas; }; /** @@ -241,6 +244,7 @@ struct vdpa_map_file { * Needed for device that using device * specific DMA translation (on-chip IOMMU) * @vdev: vdpa device + * @asid: address space identifier * @iotlb: vhost memory mapping to be * used by the vDPA * Returns integer: success (0) or error (< 0) @@ -249,6 +253,7 @@ struct vdpa_map_file { * specific DMA translation (on-chip IOMMU) * and preferring incremental map. * @vdev: vdpa device + * @asid: address space identifier * @iova: iova to be mapped * @size: size of the area * @pa: physical address for the map @@ -260,6 +265,7 @@ struct vdpa_map_file { * specific DMA translation (on-chip IOMMU) * and preferring incremental unmap. * @vdev: vdpa device + * @asid: address space identifier * @iova: iova to be unmapped * @size: size of the area * Returns integer: success (0) or error (< 0) @@ -313,10 +319,12 @@ struct vdpa_config_ops { struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ - int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); - int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, - u64 pa, u32 perm, void *opaque); - int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); + int (*set_map)(struct vdpa_device *vdev, unsigned int asid, + struct vhost_iotlb *iotlb); + int (*dma_map)(struct vdpa_device *vdev, unsigned int asid, + u64 iova, u64 size, u64 pa, u32 perm, void *opaque); + int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, + u64 iova, u64 size); /* Free device resources */ void (*free)(struct vdpa_device *vdev); @@ -324,7 +332,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - unsigned int ngroups, + unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va); @@ -336,17 +344,19 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: the number of virtqueue groups supported by this device + * @nas: the number of address spaces * @name: name of the vdpa device * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, name, use_va) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, nas, \ + name, use_va) \ container_of((__vdpa_alloc_device( \ - parent, config, ngroups, \ - sizeof(dev_struct) + \ + parent, config, ngroups, nas, \ + (sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name, use_va)), \ + dev_struct, member))), name, use_va)), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs); -- cgit v1.2.3 From 46d554b1bcd19133401d2d5c0728b85e7bfd1358 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:47 +0530 Subject: vdpa: introduce config operations for associating ASID to a virtqueue group This patch introduces a new bus operation to allow the vDPA bus driver to associate an ASID to a virtqueue group. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-8-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 1515748e84ff..f336d253db3d 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -240,6 +240,12 @@ struct vdpa_map_file { * @vdev: vdpa device * Returns the iova range supported by * the device. + * @set_group_asid: Set address space identifier for a + * virtqueue group + * @vdev: vdpa device + * @group: virtqueue group + * @asid: address space id for this group + * Returns integer: success (0) or error (< 0) * @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -325,6 +331,8 @@ struct vdpa_config_ops { u64 iova, u64 size, u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, u64 iova, u64 size); + int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, + unsigned int asid); /* Free device resources */ void (*free)(struct vdpa_device *vdev); -- cgit v1.2.3 From 1cb108994c6830cc6a6e066ad7d9a22ef59fa167 Mon Sep 17 00:00:00 2001 From: Gautam Dawar Date: Wed, 30 Mar 2022 23:33:48 +0530 Subject: vhost_iotlb: split out IOTLB initialization This patch splits out IOTLB initialization to make sure it could be reused by external modules. Signed-off-by: Jason Wang Signed-off-by: Gautam Dawar Message-Id: <20220330180436.24644-9-gdawar@xilinx.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vhost_iotlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h index 2d0e2f52f938..e79a40838998 100644 --- a/include/linux/vhost_iotlb.h +++ b/include/linux/vhost_iotlb.h @@ -36,6 +36,8 @@ int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, u64 addr, unsigned int perm); void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); +void vhost_iotlb_init(struct vhost_iotlb *iotlb, unsigned int limit, + unsigned int flags); struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags); void vhost_iotlb_free(struct vhost_iotlb *iotlb); void vhost_iotlb_reset(struct vhost_iotlb *iotlb); -- cgit v1.2.3 From ffbda8e9df10d1784d5427ec199e7d8308e3763f Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Fri, 29 Apr 2022 17:10:30 +0800 Subject: vdpa/vp_vdpa : add vdpa tool support in vp_vdpa this patch is to add the support for vdpa tool in vp_vdpa here is the example steps modprobe vp_vdpa modprobe vhost_vdpa echo 0000:00:06.0>/sys/bus/pci/drivers/virtio-pci/unbind echo 1af4 1041 > /sys/bus/pci/drivers/vp-vdpa/new_id vdpa dev add name vdpa1 mgmtdev pci/0000:00:06.0 Signed-off-by: Cindy Lu Message-Id: <20220429091030.547434-1-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- include/linux/vdpa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index f336d253db3d..15af802d41c4 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -492,7 +492,7 @@ struct vdpa_mgmtdev_ops { struct vdpa_mgmt_dev { struct device *device; const struct vdpa_mgmtdev_ops *ops; - const struct virtio_device_id *id_table; + struct virtio_device_id *id_table; u64 config_attr_mask; struct list_head list; u64 supported_features; -- cgit v1.2.3 From 48b69959a8559db5df78661d9c6d5138c2f20871 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 May 2022 14:01:14 +0800 Subject: virtio: introduce config op to synchronize vring callbacks This patch introduces new virtio config op to vring callbacks. Transport specific method is required to make sure the write before this function is visible to the vring_interrupt() that is called after the return of this function. For the transport that doesn't provide synchronize_vqs(), use synchornize_rcu() which synchronize with IRQ implicitly as a fallback. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Marc Zyngier Cc: Halil Pasic Cc: Cornelia Huck Cc: Vineeth Vijayan Cc: Peter Oberparleiter Cc: linux-s390@vger.kernel.org Reviewed-by: Cornelia Huck Signed-off-by: Jason Wang Message-Id: <20220527060120.20964-4-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo Reviewed-by: Stefano Garzarella --- include/linux/virtio_config.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index b341dd62aa4d..25be018810a7 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -57,6 +57,11 @@ struct virtio_shm_region { * include a NULL entry for vqs unused by driver * Returns 0 on success or error status * @del_vqs: free virtqueues found by find_vqs(). + * @synchronize_cbs: synchronize with the virtqueue callbacks (optional) + * The function guarantees that all memory operations on the + * queue before it are visible to the vring_interrupt() that is + * called after it. + * vdev: the virtio_device * @get_features: get the array of feature bits for this device. * vdev: the virtio_device * Returns the first 64 feature bits (all we currently need). @@ -89,6 +94,7 @@ struct virtio_config_ops { const char * const names[], const bool *ctx, struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); + void (*synchronize_cbs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); const char *(*bus_name)(struct virtio_device *vdev); @@ -217,6 +223,25 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, desc); } +/** + * virtio_synchronize_cbs - synchronize with virtqueue callbacks + * @vdev: the device + */ +static inline +void virtio_synchronize_cbs(struct virtio_device *dev) +{ + if (dev->config->synchronize_cbs) { + dev->config->synchronize_cbs(dev); + } else { + /* + * A best effort fallback to synchronize with + * interrupts, preemption and softirq disabled + * regions. See comment above synchronize_rcu(). + */ + synchronize_rcu(); + } +} + /** * virtio_device_ready - enable vq use in probe function * @vdev: the device -- cgit v1.2.3 From be83f04d2529e8dc4273efdd1ccf7b7502741071 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 May 2022 14:01:18 +0800 Subject: virtio: allow to unbreak virtqueue This patch allows the new introduced __virtio_break_device() to unbreak the virtqueue. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Marc Zyngier Cc: Halil Pasic Cc: Cornelia Huck Cc: Vineeth Vijayan Cc: Peter Oberparleiter Cc: linux-s390@vger.kernel.org Signed-off-by: Jason Wang Message-Id: <20220527060120.20964-8-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- include/linux/virtio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 5464f398912a..d8fdf170637c 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -131,6 +131,7 @@ void unregister_virtio_device(struct virtio_device *dev); bool is_virtio_device(struct device *dev); void virtio_break_device(struct virtio_device *dev); +void __virtio_unbreak_device(struct virtio_device *dev); void virtio_config_changed(struct virtio_device *dev); #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 8b4ec69d7e098a7ddf832e1e7840de53ed474c77 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 May 2022 14:01:19 +0800 Subject: virtio: harden vring IRQ This is a rework on the previous IRQ hardening that is done for virtio-pci where several drawbacks were found and were reverted: 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ that is used by some device such as virtio-blk 2) done only for PCI transport The vq->broken is re-used in this patch for implementing the IRQ hardening. The vq->broken is set to true during both initialization and reset. And the vq->broken is set to false in virtio_device_ready(). Then vring_interrupt() can check and return when vq->broken is true. And in this case, switch to return IRQ_NONE to let the interrupt core aware of such invalid interrupt to prevent IRQ storm. The reason of using a per queue variable instead of a per device one is that we may need it for per queue reset hardening in the future. Note that the hardening is only done for vring interrupt since the config interrupt hardening is already done in commit 22b7050a024d7 ("virtio: defer config changed notifications"). But the method that is used by config interrupt can't be reused by the vring interrupt handler because it uses spinlock to do the synchronization which is expensive. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Marc Zyngier Cc: Halil Pasic Cc: Cornelia Huck Cc: Vineeth Vijayan Cc: Peter Oberparleiter Cc: linux-s390@vger.kernel.org Signed-off-by: Jason Wang Message-Id: <20220527060120.20964-9-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- include/linux/virtio_config.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 25be018810a7..d4edfd7d91bb 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -256,6 +256,26 @@ void virtio_device_ready(struct virtio_device *dev) unsigned status = dev->config->get_status(dev); BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); + + /* + * The virtio_synchronize_cbs() makes sure vring_interrupt() + * will see the driver specific setup if it sees vq->broken + * as false (even if the notifications come before DRIVER_OK). + */ + virtio_synchronize_cbs(dev); + __virtio_unbreak_device(dev); + /* + * The transport should ensure the visibility of vq->broken + * before setting DRIVER_OK. See the comments for the transport + * specific set_status() method. + * + * A well behaved device will only notify a virtqueue after + * DRIVER_OK, this means the device should "see" the coherenct + * memory write that set vq->broken as false which is done by + * the driver when it sees DRIVER_OK, then the following + * driver's vring_interrupt() will see vq->broken as false so + * we won't lose any notification. + */ dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); } -- cgit v1.2.3 From 619e9e14ba3c97a80776c0b5a68a01caa41dd148 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 May 2022 14:01:20 +0800 Subject: virtio: use WARN_ON() to warning illegal status value We used to use BUG_ON() in virtio_device_ready() to detect illegal status value, this seems sub-optimal since the value is under the control of the device. Switch to use WARN_ON() instead. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Marc Zyngier Cc: Halil Pasic Cc: Cornelia Huck Cc: Vineeth Vijayan Cc: Peter Oberparleiter Cc: linux-s390@vger.kernel.org Signed-off-by: Jason Wang Message-Id: <20220527060120.20964-10-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index d4edfd7d91bb..9a36051ceb76 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -255,7 +255,7 @@ void virtio_device_ready(struct virtio_device *dev) { unsigned status = dev->config->get_status(dev); - BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); + WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); /* * The virtio_synchronize_cbs() makes sure vring_interrupt() -- cgit v1.2.3 From 3fc2a9e89b3508a5cc0c324f26d7b4740ba8c456 Mon Sep 17 00:00:00 2001 From: Changcheng Liu Date: Tue, 26 Apr 2022 21:28:14 +0800 Subject: net/mlx5: correct ECE offset in query qp output ECE field should be after opt_param_mask in query qp output. Fixes: 6b646a7e4af6 ("net/mlx5: Add ability to read and write ECE options") Signed-off-by: Changcheng Liu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 78b3d3465dd7..2cd7d611e7b3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5176,12 +5176,11 @@ struct mlx5_ifc_query_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x20]; - u8 ece[0x20]; + u8 reserved_at_40[0x40]; u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; -- cgit v1.2.3 From c3ed222745d9ad7b69299b349a64ba533c64a34f Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 14 May 2022 07:05:13 -0400 Subject: NFSv4: Fix free of uninitialized nfs4_label on referral lookup. Send along the already-allocated fattr along with nfs4_fs_locations, and drop the memcpy of fattr. We end up growing two more allocations, but this fixes up a crash as: PID: 790 TASK: ffff88811b43c000 CPU: 0 COMMAND: "ls" #0 [ffffc90000857920] panic at ffffffff81b9bfde #1 [ffffc900008579c0] do_trap at ffffffff81023a9b #2 [ffffc90000857a10] do_error_trap at ffffffff81023b78 #3 [ffffc90000857a58] exc_stack_segment at ffffffff81be1f45 #4 [ffffc90000857a80] asm_exc_stack_segment at ffffffff81c009de #5 [ffffc90000857b08] nfs_lookup at ffffffffa0302322 [nfs] #6 [ffffc90000857b70] __lookup_slow at ffffffff813a4a5f #7 [ffffc90000857c60] walk_component at ffffffff813a86c4 #8 [ffffc90000857cb8] path_lookupat at ffffffff813a9553 #9 [ffffc90000857cf0] filename_lookup at ffffffff813ab86b Suggested-by: Trond Myklebust Fixes: 9558a007dbc3 ("NFS: Remove the label from the nfs4_lookup_res struct") Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4a8ba84f848e..0e3aa0f5f324 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1222,7 +1222,7 @@ struct nfs4_fs_location { #define NFS4_FS_LOCATIONS_MAXENTRIES 10 struct nfs4_fs_locations { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; const struct nfs_server *server; struct nfs4_pathname fs_path; int nlocations; -- cgit v1.2.3 From 118f09eda21d392e1eeb9f8a4bee044958cccf20 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 25 May 2022 12:12:59 -0400 Subject: NFSv4.1 mark qualified async operations as MOVEABLE tasks Mark async operations such as RENAME, REMOVE, COMMIT MOVEABLE for the nfsv4.1+ sessions. Fixes: 85e39feead948 ("NFSv4.1 identify and mark RPC tasks that can move between transports") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 157d2bd6b241..ea2f7e6b1b0b 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -287,4 +287,5 @@ struct nfs_server { #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) #define NFS_CAP_FS_LOCATIONS (1U << 30) +#define NFS_CAP_MOVEABLE (1U << 31) #endif -- cgit v1.2.3 From 597b89d30b42dcc8e6b262e6876b42dde66f97f0 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Mon, 16 May 2022 11:52:51 +0300 Subject: gpu: host1x: Add context bus The context bus is a "dummy" bus that contains struct devices that correspond to IOMMU contexts assigned through Host1x to processes. Even when host1x itself is built as a module, the bus is registered in built-in code so that the built-in ARM SMMU driver is able to reference it. Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- include/linux/host1x_context_bus.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/linux/host1x_context_bus.h (limited to 'include/linux') diff --git a/include/linux/host1x_context_bus.h b/include/linux/host1x_context_bus.h new file mode 100644 index 000000000000..72462737a6db --- /dev/null +++ b/include/linux/host1x_context_bus.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + */ + +#ifndef __LINUX_HOST1X_CONTEXT_BUS_H +#define __LINUX_HOST1X_CONTEXT_BUS_H + +#include + +#ifdef CONFIG_TEGRA_HOST1X_CONTEXT_BUS +extern struct bus_type host1x_context_device_bus_type; +#endif + +#endif -- cgit v1.2.3 From 662ce1dc9caf493c309200edbe38d186f1ea20d0 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 1 Jun 2022 15:55:25 -0700 Subject: delayacct: track delays from write-protect copy Delay accounting does not track the delay of write-protect copy. When tasks trigger many write-protect copys(include COW and unsharing of anonymous pages[1]), it may spend a amount of time waiting for them. To get the delay of tasks in write-protect copy, could help users to evaluate the impact of using KSM or fork() or GUP. Also update tools/accounting/getdelays.c: / # ./getdelays -dl -p 231 print delayacct stats ON listen forever PID 231 CPU count real total virtual total delay total delay average 6247 1859000000 2154070021 1674255063 0.268ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms WPCOPY count delay total delay average 3635 271567604 0ms [1] commit 31cc5bc4af70("mm: support GUP-triggered unsharing of anonymous pages") Link: https://lkml.kernel.org/r/20220409014342.2505532-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: David Hildenbrand Reviewed-by: Jiang Xuexin Reviewed-by: Ran Xiaokai Reviewed-by: wangyong Cc: Jonathan Corbet Cc: Balbir Singh Cc: Mike Kravetz Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6b16a6930a19..58aea2d7385c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -45,9 +45,13 @@ struct task_delay_info { u64 compact_start; u64 compact_delay; /* wait for memory compact */ + u64 wpcopy_start; + u64 wpcopy_delay; /* wait for write-protect copy */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ u32 compact_count; /* total count of memory compact */ + u32 wpcopy_count; /* total count of write-protect copy */ }; #endif @@ -75,6 +79,8 @@ extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); extern void __delayacct_compact_start(void); extern void __delayacct_compact_end(void); +extern void __delayacct_wpcopy_start(void); +extern void __delayacct_wpcopy_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -191,6 +197,24 @@ static inline void delayacct_compact_end(void) __delayacct_compact_end(); } +static inline void delayacct_wpcopy_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_start(); +} + +static inline void delayacct_wpcopy_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_wpcopy_end(); +} + #else static inline void delayacct_init(void) {} @@ -225,6 +249,10 @@ static inline void delayacct_compact_start(void) {} static inline void delayacct_compact_end(void) {} +static inline void delayacct_wpcopy_start(void) +{} +static inline void delayacct_wpcopy_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ -- cgit v1.2.3 From 22296a5c0cd35aaf62e1af3266f82cdf6b0b9b78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Jun 2022 09:18:58 -0700 Subject: net: add debug info to __skb_pull() While analyzing yet another syzbot report, I found the following patch very useful. It allows to better understand what went wrong. This debug info is only enabled if CONFIG_DEBUG_NET=y, which is the case for syzbot builds. Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index da96f0d3e753..d3d10556f0fa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2696,7 +2696,14 @@ void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { skb->len -= len; - BUG_ON(skb->len < skb->data_len); + if (unlikely(skb->len < skb->data_len)) { +#if defined(CONFIG_DEBUG_NET) + skb->len += len; + pr_err("__skb_pull(len=%u)\n", len); + skb_dump(KERN_ERR, skb, false); +#endif + BUG(); + } return skb->data += len; } -- cgit v1.2.3 From 46859ac8af52ae599e1b51992ddef3eb43f295fc Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 31 May 2022 18:04:12 +0800 Subject: LoongArch: Add multi-processor (SMP) support LoongArch-based procesors have 4, 8 or 16 cores per package. This patch adds multi-processor (SMP) support for LoongArch. Reviewed-by: WANG Xuerui Reviewed-by: Jiaxun Yang Signed-off-by: Huacai Chen --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b66c5f389159..19f0dbfdd7fe 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -130,6 +130,7 @@ enum cpuhp_state { CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, + CPUHP_LOONGARCH_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, -- cgit v1.2.3 From 6d7131bd52b3e0dd068a0067e5584b3f36cf17eb Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 11 Apr 2022 17:05:55 +0200 Subject: include/linux/find: Fix documentation The order of the arguments in function documentation doesn't fit the implementation. Change the documentation so that it corresponds to the code. This prevent people to get confused when reading the documentation. Signed-off-by: Anna-Maria Behnsen Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/find.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 5bb6db213bcb..424ef67d4a42 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -21,8 +21,8 @@ extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long siz /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on - * @offset: The bitnumber to start searching at * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. @@ -50,8 +50,8 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on - * @offset: The bitnumber to start searching at * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. @@ -79,8 +79,8 @@ unsigned long find_next_and_bit(const unsigned long *addr1, /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on - * @offset: The bitnumber to start searching at * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. -- cgit v1.2.3 From e041e0ac53dd52d2d201aa87edc3adaca1085299 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 28 Apr 2022 13:51:12 -0700 Subject: lib/bitmap: extend comment for bitmap_(from,to)_arr32() On LE systems bitmaps are naturally ordered, therefore we can potentially use bitmap_copy routines when converting from 32-bit arrays, even if host system is 64-bit. But it may lead to out-of-bond access due to unsafe typecast, and the bitmap_(from,to)_arr32 comment doesn't explain that clearly CC: Alexander Gordeev CC: Andy Shevchenko CC: Christian Borntraeger CC: Claudio Imbrenda CC: David Hildenbrand CC: Heiko Carstens CC: Janosch Frank CC: Rasmus Villemoes CC: Sven Schnelle CC: Vasily Gorbik Signed-off-by: Yury Norov --- include/linux/bitmap.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7dba0847510c..afcf7b8dddd1 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -264,8 +264,12 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, } /* - * On 32-bit systems bitmaps are represented as u32 arrays internally, and - * therefore conversion is not needed when copying data from/to arrays of u32. + * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 + * machines the order of hi and lo parts of numbers match the bitmap structure. + * In both cases conversion is not needed when copying data from/to arrays of + * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead + * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit + * architectures are not using bitmap_copy_clear_tail(). */ #if BITS_PER_LONG == 64 void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, -- cgit v1.2.3 From 0a97953fd2210d0ac8eb5c76f8bd08fb53b6d3d6 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 28 Apr 2022 13:51:13 -0700 Subject: lib: add bitmap_{from,to}_arr64 Manipulating 64-bit arrays with bitmap functions is potentially dangerous because on 32-bit BE machines the order of halfwords doesn't match. Another issue is that compiler may throw a warning about out-of-boundary access. This patch adds bitmap_{from,to}_arr64 functions in addition to existing bitmap_{from,to}_arr32. CC: Alexander Gordeev CC: Andy Shevchenko CC: Christian Borntraeger CC: Claudio Imbrenda CC: David Hildenbrand CC: Heiko Carstens CC: Janosch Frank CC: Rasmus Villemoes CC: Sven Schnelle CC: Vasily Gorbik Signed-off-by: Yury Norov --- include/linux/bitmap.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index afcf7b8dddd1..71147b7d721b 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -72,6 +72,8 @@ struct device; * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region * bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst * bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst + * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst + * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start * @@ -285,6 +287,22 @@ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, (const unsigned long *) (bitmap), (nbits)) #endif +/* + * On 64-bit systems bitmaps are represented as u64 arrays internally. On LE32 + * machines the order of hi and lo parts of numbers match the bitmap structure. + * In both cases conversion is not needed when copying data from/to arrays of + * u64. + */ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) +void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits); +void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); +#else +#define bitmap_from_arr64(bitmap, buf, nbits) \ + bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits)) +#define bitmap_to_arr64(buf, bitmap, nbits) \ + bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) +#endif + static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { @@ -518,10 +536,7 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, */ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) { - dst[0] = mask & ULONG_MAX; - - if (sizeof(mask) > sizeof(unsigned long)) - dst[1] = mask >> 32; + bitmap_from_arr64(dst, &mask, 64); } /** -- cgit v1.2.3 From 005f17007f47495dbbb659aa5db7e581065d16e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 18 May 2022 13:52:22 -0700 Subject: bitmap: Fix return values to be unsigned Both nodemask and bitmap routines had mixed return values that provided potentially signed return values that could never happen. This was leading to the compiler getting confusing about the range of possible return values (it was thinking things could be negative where they could not be). In preparation for fixing nodemask, fix all the bitmap routines that should be returning unsigned (or bool) values. Cc: Yury Norov Cc: Rasmus Villemoes Cc: Christophe de Dinechin Cc: Alexey Dobriyan Cc: Andy Shevchenko Cc: Andrew Morton Cc: Zhen Lei Signed-off-by: Kees Cook Signed-off-by: Yury Norov --- include/linux/bitmap.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 71147b7d721b..2e6cd5681040 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -134,8 +134,8 @@ unsigned long *devm_bitmap_zalloc(struct device *dev, * lib/bitmap.c provides these functions: */ -int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); +bool __bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); bool __pure __bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, const unsigned long *src3, @@ -159,10 +159,10 @@ int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits); -int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); +bool __bitmap_intersects(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +bool __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); @@ -353,8 +353,8 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -static inline int bitmap_equal(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static inline bool bitmap_equal(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); @@ -384,8 +384,9 @@ static inline bool bitmap_or_equal(const unsigned long *src1, return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } -static inline int bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static inline bool bitmap_intersects(const unsigned long *src1, + const unsigned long *src2, + unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; @@ -393,8 +394,8 @@ static inline int bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -static inline int bitmap_subset(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static inline bool bitmap_subset(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); -- cgit v1.2.3 From 0dfe54071d7c828a02917b595456bfde1afdddc9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 18 May 2022 13:52:23 -0700 Subject: nodemask: Fix return values to be unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nodemask routines had mixed return values that provided potentially signed return values that could never happen. This was leading to the compiler getting confusing about the range of possible return values (it was thinking things could be negative where they could not be). Fix all the nodemask routines that should be returning unsigned (or bool) values. Silences: mm/swapfile.c: In function ‘setup_swap_info’: mm/swapfile.c:2291:47: error: array subscript -1 is below array bounds of ‘struct plist_node[]’ [-Werror=array-bounds] 2291 | p->avail_lists[i].prio = 1; | ~~~~~~~~~~~~~~^~~ In file included from mm/swapfile.c:16: ./include/linux/swap.h:292:27: note: while referencing ‘avail_lists’ 292 | struct plist_node avail_lists[]; /* | ^~~~~~~~~~~ Reported-by: Christophe de Dinechin Link: https://lore.kernel.org/lkml/20220414150855.2407137-3-dinechin@redhat.com/ Cc: Alexey Dobriyan Cc: Yury Norov Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Andrew Morton Cc: Zhen Lei Signed-off-by: Kees Cook Signed-off-by: Yury Norov --- include/linux/nodemask.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 567c3ddba2c4..2c39663c3407 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -42,11 +42,11 @@ * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * - * int first_node(mask) Number lowest set bit, or MAX_NUMNODES - * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES - * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES + * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES - * int first_unset_node(mask) First node not set in mask, or + * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set @@ -153,7 +153,7 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) -static inline int __node_test_and_set(int node, nodemask_t *addr) +static inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } @@ -200,7 +200,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) -static inline int __nodes_equal(const nodemask_t *src1p, +static inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); @@ -208,7 +208,7 @@ static inline int __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) -static inline int __nodes_intersects(const nodemask_t *src1p, +static inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); @@ -216,20 +216,20 @@ static inline int __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) -static inline int __nodes_subset(const nodemask_t *src1p, +static inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) +static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) +static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } @@ -260,15 +260,15 @@ static inline void __nodes_shift_left(nodemask_t *dstp, > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) -static inline int __first_node(const nodemask_t *srcp) +static inline unsigned int __first_node(const nodemask_t *srcp) { - return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); + return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) -static inline int __next_node(int n, const nodemask_t *srcp) +static inline unsigned int __next_node(int n, const nodemask_t *srcp) { - return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); + return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* @@ -276,7 +276,7 @@ static inline int __next_node(int n, const nodemask_t *srcp) * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) -int __next_node_in(int node, const nodemask_t *srcp); +unsigned int __next_node_in(int node, const nodemask_t *srcp); static inline void init_nodemask_of_node(nodemask_t *mask, int node) { @@ -296,9 +296,9 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) }) #define first_unset_node(mask) __first_unset_node(&(mask)) -static inline int __first_unset_node(const nodemask_t *maskp) +static inline unsigned int __first_unset_node(const nodemask_t *maskp) { - return min_t(int,MAX_NUMNODES, + return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } @@ -436,11 +436,11 @@ static inline int num_node_state(enum node_states state) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) -static inline int next_online_node(int nid) +static inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } -static inline int next_memory_node(int nid) +static inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } -- cgit v1.2.3 From a734510fa8b4e61e6a37176f0da01f4c55fa52de Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 25 May 2022 13:49:42 +0200 Subject: ata: libata: drop 'sas_last_tag' Unused now. Fixes: 4f1a22ee7b57 ("libata: Improve ATA queued command allocation") Cc: John Garry Signed-off-by: Hannes Reinecke Reviewed-by: John Garry Signed-off-by: Damien Le Moal --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 732de9014626..0f2a59c9c735 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -822,7 +822,6 @@ struct ata_port { struct ata_queued_cmd qcmd[ATA_MAX_QUEUE + 1]; u64 qc_active; int nr_active_links; /* #links with active qcs */ - unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ -- cgit v1.2.3 From 2130a790ca49763f724ec45cf93b9dd765e2023e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Jun 2022 15:05:26 +0200 Subject: kernel: add platform_has() infrastructure Add a simple infrastructure for setting, resetting and querying platform feature flags. Flags can be either global or architecture specific. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Tested-by: Oleksandr Tyshchenko # Arm64 only Reviewed-by: Christoph Hellwig Acked-by: Borislav Petkov Signed-off-by: Juergen Gross --- include/linux/platform-feature.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/linux/platform-feature.h (limited to 'include/linux') diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h new file mode 100644 index 000000000000..6ed859928b97 --- /dev/null +++ b/include/linux/platform-feature.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PLATFORM_FEATURE_H +#define _PLATFORM_FEATURE_H + +#include +#include + +/* The platform features are starting with the architecture specific ones. */ +#define PLATFORM_FEAT_N (0 + PLATFORM_ARCH_FEAT_N) + +void platform_set(unsigned int feature); +void platform_clear(unsigned int feature); +bool platform_has(unsigned int feature); + +#endif /* _PLATFORM_FEATURE_H */ -- cgit v1.2.3 From 3f9dfbebdc48cebfbda738f6f3d1dbf6d7232f90 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 6 Jun 2022 08:09:16 +0200 Subject: virtio: replace arch_has_restricted_virtio_memory_access() Instead of using arch_has_restricted_virtio_memory_access() together with CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS, replace those with platform_has() and a new platform feature PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Tested-by: Oleksandr Tyshchenko # Arm64 only Reviewed-by: Christoph Hellwig Acked-by: Borislav Petkov --- include/linux/platform-feature.h | 6 +++++- include/linux/virtio_config.h | 9 --------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h index 6ed859928b97..b2f48be999fa 100644 --- a/include/linux/platform-feature.h +++ b/include/linux/platform-feature.h @@ -6,7 +6,11 @@ #include /* The platform features are starting with the architecture specific ones. */ -#define PLATFORM_FEAT_N (0 + PLATFORM_ARCH_FEAT_N) + +/* Used to enable platform specific DMA handling for virtio devices. */ +#define PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS (0 + PLATFORM_ARCH_FEAT_N) + +#define PLATFORM_FEAT_N (1 + PLATFORM_ARCH_FEAT_N) void platform_set(unsigned int feature); void platform_clear(unsigned int feature); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 9a36051ceb76..49c7c32815f1 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -604,13 +604,4 @@ static inline void virtio_cwrite64(struct virtio_device *vdev, _r; \ }) -#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS -int arch_has_restricted_virtio_memory_access(void); -#else -static inline int arch_has_restricted_virtio_memory_access(void) -{ - return 0; -} -#endif /* CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS */ - #endif /* _LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3 From 77991645952c21962a095910c51fe0f73d35bf91 Mon Sep 17 00:00:00 2001 From: Roger Knecht Date: Sat, 21 May 2022 14:47:45 +0200 Subject: crc-itu-t: fix typo in CRC ITU-T polynomial comment The code comment says that the polynomial is x^16 + x^12 + x^15 + 1, but the correct polynomial is x^16 + x^12 + x^5 + 1. Quoting from page 2 in the ITU-T V.41 specification [1]: 2 Encoding and checking process The service bits and information bits, taken in conjunction, correspond to the coefficients of a message polynomial having terms from x^(n-1) (n = total number of bits in a block or sequence) down to x^16. This polynomial is divided, modulo 2, by the generating polynomial x^16 + x^12 + x^5 + 1. The hex (truncated) polynomial 0x1021 and CRC code implementation are correct, however. [1] https://www.itu.int/rec/T-REC-V.41-198811-I/en Signed-off-by: Roger Knecht Acked-by: Randy Dunlap Signed-off-by: Jason A. Donenfeld --- include/linux/crc-itu-t.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crc-itu-t.h b/include/linux/crc-itu-t.h index a4367051e192..2f991a427ade 100644 --- a/include/linux/crc-itu-t.h +++ b/include/linux/crc-itu-t.h @@ -4,7 +4,7 @@ * * Implements the standard CRC ITU-T V.41: * Width 16 - * Poly 0x1021 (x^16 + x^12 + x^15 + 1) + * Poly 0x1021 (x^16 + x^12 + x^5 + 1) * Init 0 */ -- cgit v1.2.3 From c4f135d643823a869becfa87539f7820ef9d5bfa Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 1 Jun 2022 16:32:47 +0900 Subject: workqueue: Wrap flush_workqueue() using a macro Since flush operation synchronously waits for completion, flushing system-wide WQs (e.g. system_wq) might introduce possibility of deadlock due to unexpected locking dependency. Tejun Heo commented at [1] that it makes no sense at all to call flush_workqueue() on the shared WQs as the caller has no idea what it's gonna end up waiting for. Although there is flush_scheduled_work() which flushes system_wq WQ with "Think twice before calling this function! It's very easy to get into trouble if you don't take great care." warning message, syzbot found a circular locking dependency caused by flushing system_wq WQ [2]. Therefore, let's change the direction to that developers had better use their local WQs if flush_scheduled_work()/flush_workqueue(system_*_wq) is inevitable. Steps for converting system-wide WQs into local WQs are explained at [3], and a conversion to stop flushing system-wide WQs is in progress. Now we want some mechanism for preventing developers who are not aware of this conversion from again start flushing system-wide WQs. Since I found that WARN_ON() is complete but awkward approach for teaching developers about this problem, let's use __compiletime_warning() for incomplete but handy approach. For completeness, we will also insert WARN_ON() into __flush_workqueue() after all in-tree users stopped calling flush_scheduled_work(). Link: https://lore.kernel.org/all/YgnQGZWT%2Fn3VAITX@slm.duckdns.org/ [1] Link: https://syzkaller.appspot.com/bug?extid=bde0f89deacca7c765b8 [2] Link: https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp [3] Signed-off-by: Tetsuo Handa Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 64 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7fee9b6cfede..e1f1c8b1121b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -445,7 +445,7 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); -extern void flush_workqueue(struct workqueue_struct *wq); +extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); @@ -563,15 +563,23 @@ static inline bool schedule_work(struct work_struct *work) return queue_work(system_wq, work); } +/* + * Detect attempt to flush system-wide workqueues at compile time when possible. + * + * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp + * for reasons and steps for converting system-wide workqueues into local workqueues. + */ +extern void __warn_flushing_systemwide_wq(void) + __compiletime_warning("Please avoid flushing system-wide workqueues."); + /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * - * Think twice before calling this function! It's very easy to get into - * trouble if you don't take great care. Either of the following situations - * will lead to deadlock: + * It's very easy to get into trouble if you don't take great care. + * Either of the following situations will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. @@ -586,11 +594,51 @@ static inline bool schedule_work(struct work_struct *work) * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. + * + * Please stop calling this function! A conversion to stop flushing system-wide + * workqueues is in progress. This function will be removed after all in-tree + * users stopped calling this function. */ -static inline void flush_scheduled_work(void) -{ - flush_workqueue(system_wq); -} +/* + * The background of commit 771c035372a036f8 ("deprecate the + * '__deprecated' attribute warnings entirely and for good") is that, + * since Linus builds all modules between every single pull he does, + * the standard kernel build needs to be _clean_ in order to be able to + * notice when new problems happen. Therefore, don't emit warning while + * there are in-tree users. + */ +#define flush_scheduled_work() \ +({ \ + if (0) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(system_wq); \ +}) + +/* + * Although there is no longer in-tree caller, for now just emit warning + * in order to give out-of-tree callers time to update. + */ +#define flush_workqueue(wq) \ +({ \ + struct workqueue_struct *_wq = (wq); \ + \ + if ((__builtin_constant_p(_wq == system_wq) && \ + _wq == system_wq) || \ + (__builtin_constant_p(_wq == system_highpri_wq) && \ + _wq == system_highpri_wq) || \ + (__builtin_constant_p(_wq == system_long_wq) && \ + _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_unbound_wq) && \ + _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_freezable_wq) && \ + _wq == system_freezable_wq) || \ + (__builtin_constant_p(_wq == system_power_efficient_wq) && \ + _wq == system_power_efficient_wq) || \ + (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ + _wq == system_freezable_power_efficient_wq)) \ + __warn_flushing_systemwide_wq(); \ + __flush_workqueue(_wq); \ +}) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay -- cgit v1.2.3 From 62ed448cc53b654036f7d7f3c99f299d79ad14c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 7 Jun 2022 16:47:58 -0400 Subject: SUNRPC: Optimize xdr_reserve_space() Transitioning between encode buffers is quite infrequent. It happens about 1 time in 400 calls to xdr_reserve_space(), measured on NFSD with a typical build/test workload. Force the compiler to remove that code from xdr_reserve_space(), which is a hot path on both the server and the client. This change reduces the size of xdr_reserve_space() from 10 cache lines to 2 when compiled with -Os. Signed-off-by: Chuck Lever Reviewed-by: J. Bruce Fields --- include/linux/sunrpc/xdr.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 4417f667c757..5860f32e3958 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -243,7 +243,7 @@ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes); -extern void xdr_commit_encode(struct xdr_stream *xdr); +extern void __xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, @@ -306,6 +306,20 @@ xdr_reset_scratch_buffer(struct xdr_stream *xdr) xdr_set_scratch_buffer(xdr, NULL, 0); } +/** + * xdr_commit_encode - Ensure all data is written to xdr->buf + * @xdr: pointer to xdr_stream + * + * Handle encoding across page boundaries by giving the caller a + * temporary location to write to, then later copying the data into + * place. __xdr_commit_encode() does that copying. + */ +static inline void xdr_commit_encode(struct xdr_stream *xdr) +{ + if (unlikely(xdr->scratch.iov_len)) + __xdr_commit_encode(xdr); +} + /** * xdr_stream_remaining - Return the number of bytes remaining in the stream * @xdr: pointer to struct xdr_stream -- cgit v1.2.3 From d5a37b19983725d2045588cfa3a4699f5b39ae26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jun 2022 08:34:07 +0200 Subject: block: remove bioset_init_from_src Unused now, and the interface never really made a whole lot of sense to start with. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1cf3738ef1ea..992ee987f273 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -403,7 +403,6 @@ enum { extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); -extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, -- cgit v1.2.3 From 00d1f546470d89e072dd3cda12b5c794341e7268 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 9 Jun 2022 12:19:01 +0800 Subject: vdpa: make get_vq_group and set_group_asid optional This patch makes get_vq_group and set_group_asid optional. This is needed to unbreak the vDPA parent that doesn't support multiple address spaces. Cc: Gautam Dawar Fixes: aaca8373c4b1 ("vhost-vdpa: support ASID based IOTLB API") Signed-off-by: Jason Wang Message-Id: <20220609041901.2029-1-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4700a88a28f6..7b4a13d3bd91 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -178,7 +178,8 @@ struct vdpa_map_file { * for the device * @vdev: vdpa device * Returns virtqueue algin requirement - * @get_vq_group: Get the group id for a specific virtqueue + * @get_vq_group: Get the group id for a specific + * virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index * Returns u32: group id for this virtqueue @@ -243,7 +244,7 @@ struct vdpa_map_file { * Returns the iova range supported by * the device. * @set_group_asid: Set address space identifier for a - * virtqueue group + * virtqueue group (optional) * @vdev: vdpa device * @group: virtqueue group * @asid: address space id for this group -- cgit v1.2.3 From 69a37a8ba1b408a1c7616494aa7018e4b3844cbe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Jun 2022 15:18:34 -0400 Subject: mm/huge_memory: Fix xarray node memory leak If xas_split_alloc() fails to allocate the necessary nodes to complete the xarray entry split, it sets the xa_state to -ENOMEM, which xas_nomem() then interprets as "Please allocate more memory", not as "Please free any unnecessary memory" (which was the intended outcome). It's confusing to use xas_nomem() to free memory in this context, so call xas_destroy() instead. Reported-by: syzbot+9e27a75a8c24f3fe75c1@syzkaller.appspotmail.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 72feab5ea8d4..c29e11b2c073 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1508,6 +1508,7 @@ void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); +void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); -- cgit v1.2.3 From 334f6f53abcf57782bd2fe81da1cbd893e4ef05c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Jun 2022 09:13:57 -0400 Subject: mm: Add kernel-doc for folio->mlock_count Fix "./include/linux/mm_types.h:279: warning: Function parameter or member 'mlock_count' not described in 'folio'". Also neaten the html by hiding the anon struct. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b34ff2cdbc4f..c29ab4c0cd5c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -227,6 +227,7 @@ struct page { * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. * @lru: Least Recently Used list; tracks how recently this folio was used. + * @mlock_count: Number of times this folio has been pinned by mlock(). * @mapping: The file this page belongs to, or refers to the anon_vma for * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, @@ -255,10 +256,14 @@ struct folio { unsigned long flags; union { struct list_head lru; + /* private: avoid cluttering the output */ struct { void *__filler; + /* public: */ unsigned int mlock_count; + /* private: */ }; + /* public: */ }; struct address_space *mapping; pgoff_t index; -- cgit v1.2.3 From 874c8ca1e60b2c564a48f7e7acc40d328d5c8733 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Jun 2022 21:46:04 +0100 Subject: netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton Reviewed-by: Kees Cook Reviewed-by: Xiubo Li cc: Jonathan Corbet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Steve French cc: William Kucharski cc: "Matthew Wilcox (Oracle)" cc: Dave Chinner cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds --- include/linux/netfs.h | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 77fa6a61706a..6dbb4c9ce50d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -119,9 +119,10 @@ typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, bool was_async); /* - * Per-inode description. This must be directly after the inode struct. + * Per-inode context. This wraps the VFS inode. */ -struct netfs_i_context { +struct netfs_inode { + struct inode inode; /* The VFS inode */ const struct netfs_request_ops *ops; #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cache; @@ -256,7 +257,7 @@ struct netfs_cache_ops { * boundary as appropriate. */ enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, - loff_t i_size); + loff_t i_size); /* Prepare a write operation, working out what part of the write we can * actually do. @@ -288,45 +289,35 @@ extern void netfs_put_subrequest(struct netfs_io_subrequest *subreq, extern void netfs_stats_show(struct seq_file *); /** - * netfs_i_context - Get the netfs inode context from the inode + * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query * * Get the netfs lib inode context from the network filesystem's inode. The * context struct is expected to directly follow on from the VFS inode struct. */ -static inline struct netfs_i_context *netfs_i_context(struct inode *inode) +static inline struct netfs_inode *netfs_inode(struct inode *inode) { - return (void *)inode + sizeof(*inode); + return container_of(inode, struct netfs_inode, inode); } /** - * netfs_inode - Get the netfs inode from the inode context - * @ctx: The context to query - * - * Get the netfs inode from the netfs library's inode context. The VFS inode - * is expected to directly precede the context struct. - */ -static inline struct inode *netfs_inode(struct netfs_i_context *ctx) -{ - return (void *)ctx - sizeof(struct inode); -} - -/** - * netfs_i_context_init - Initialise a netfs lib context + * netfs_inode_init - Initialise a netfslib inode context * @inode: The inode with which the context is associated * @ops: The netfs's operations list * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ -static inline void netfs_i_context_init(struct inode *inode, - const struct netfs_request_ops *ops) +static inline void netfs_inode_init(struct inode *inode, + const struct netfs_request_ops *ops) { - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); - memset(ctx, 0, sizeof(*ctx)); ctx->ops = ops; ctx->remote_i_size = i_size_read(inode); +#if IS_ENABLED(CONFIG_FSCACHE) + ctx->cache = NULL; +#endif } /** @@ -338,7 +329,7 @@ static inline void netfs_i_context_init(struct inode *inode, */ static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) { - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); ctx->remote_i_size = new_i_size; } @@ -352,7 +343,7 @@ static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) static inline struct fscache_cookie *netfs_i_cookie(struct inode *inode) { #if IS_ENABLED(CONFIG_FSCACHE) - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); return ctx->cache; #else return NULL; -- cgit v1.2.3 From 39e0f991a62ed5efabd20711a7b6e7da92603170 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 7 Jun 2022 17:00:16 +0200 Subject: random: mark bootloader randomness code as __init add_bootloader_randomness() and the variables it touches are only used during __init and not after, so mark these as __init. At the same time, unexport this, since it's only called by other __init code that's built-in. Cc: stable@vger.kernel.org Fixes: 428826f5358c ("fdt: add support for rng-seed") Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index fae0c84027fd..223b4bd584e7 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -13,7 +13,7 @@ struct notifier_block; void add_device_randomness(const void *buf, size_t len); -void add_bootloader_randomness(const void *buf, size_t len); +void __init add_bootloader_randomness(const void *buf, size_t len); void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; void add_interrupt_randomness(int irq) __latent_entropy; -- cgit v1.2.3 From e052a478a7daeca67664f7addd308ff51dd40654 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 8 Jun 2022 10:31:25 +0200 Subject: random: remove rng_has_arch_random() With arch randomness being used by every distro and enabled in defconfigs, the distinction between rng_has_arch_random() and rng_is_initialized() is now rather small. In fact, the places where they differ are now places where paranoid users and system builders really don't want arch randomness to be used, in which case we should respect that choice, or places where arch randomness is known to be broken, in which case that choice is all the more important. So this commit just removes the function and its one user. Reviewed-by: Petr Mladek # for vsprintf.c Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 223b4bd584e7..20e389a14e5c 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -74,7 +74,6 @@ static inline unsigned long get_random_canary(void) int __init random_init(const char *command_line); bool rng_is_initialized(void); -bool rng_has_arch_random(void); int wait_for_random_bytes(void); /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes). -- cgit v1.2.3 From e81fb4198e27925b151aad1450e0fd607d6733f8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Jun 2022 15:04:01 -0700 Subject: netfs: Further cleanups after struct netfs_inode wrapper introduced Change the signature of netfs helper functions to take a struct netfs_inode pointer rather than a struct inode pointer where appropriate, thereby relieving the need for the network filesystem to convert its internal inode format down to the VFS inode only for netfslib to bounce it back up. For type safety, it's better not to do that (and it's less typing too). Give netfs_write_begin() an extra argument to pass in a pointer to the netfs_inode struct rather than deriving it internally from the file pointer. Note that the ->write_begin() and ->write_end() ops are intended to be replaced in the future by netfslib code that manages this without the need to call in twice for each page. netfs_readpage() and similar are intended to be pointed at directly by the address_space_operations table, so must stick to the signature dictated by the function pointers there. Changes ======= - Updated the kerneldoc comments and documentation [DH]. Signed-off-by: David Howells cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/CAHk-=wgkwKyNmNdKpQkqZ6DnmUL-x9hp0YBnUGjaPFEAdxDTbw@mail.gmail.com/ --- include/linux/netfs.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 6dbb4c9ce50d..a62739f3726b 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -277,7 +277,8 @@ struct netfs_cache_ops { struct readahead_control; extern void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); -extern int netfs_write_begin(struct file *, struct address_space *, +extern int netfs_write_begin(struct netfs_inode *, + struct file *, struct address_space *, loff_t, unsigned int, struct folio **, void **); @@ -302,19 +303,17 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) /** * netfs_inode_init - Initialise a netfslib inode context - * @inode: The inode with which the context is associated + * @inode: The netfs inode to initialise * @ops: The netfs's operations list * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ -static inline void netfs_inode_init(struct inode *inode, +static inline void netfs_inode_init(struct netfs_inode *ctx, const struct netfs_request_ops *ops) { - struct netfs_inode *ctx = netfs_inode(inode); - ctx->ops = ops; - ctx->remote_i_size = i_size_read(inode); + ctx->remote_i_size = i_size_read(&ctx->inode); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -322,28 +321,25 @@ static inline void netfs_inode_init(struct inode *inode, /** * netfs_resize_file - Note that a file got resized - * @inode: The inode being resized + * @ctx: The netfs inode being resized * @new_i_size: The new file size * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) +static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size) { - struct netfs_inode *ctx = netfs_inode(inode); - ctx->remote_i_size = new_i_size; } /** * netfs_i_cookie - Get the cache cookie from the inode - * @inode: The inode to query + * @ctx: The netfs inode to query * * Get the caching cookie (if enabled) from the network filesystem's inode. */ -static inline struct fscache_cookie *netfs_i_cookie(struct inode *inode) +static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) - struct netfs_inode *ctx = netfs_inode(inode); return ctx->cache; #else return NULL; -- cgit v1.2.3 From 40a81101202300df7db273f77dda9fbe6271b1d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Feb 2022 11:19:14 +0000 Subject: netfs: Rename the netfs_io_request cleanup op and give it an op pointer The netfs_io_request cleanup op is now always in a position to be given a pointer to a netfs_io_request struct, so this can be passed in instead of the mapping and private data arguments (both of which are included in the struct). So rename the ->cleanup op to ->free_request (to match ->init_request) and pass in the I/O pointer. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com --- include/linux/netfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index a62739f3726b..097cdd644665 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -206,7 +206,9 @@ struct netfs_io_request { */ struct netfs_request_ops { int (*init_request)(struct netfs_io_request *rreq, struct file *file); + void (*free_request)(struct netfs_io_request *rreq); int (*begin_cache_operation)(struct netfs_io_request *rreq); + void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); @@ -214,7 +216,6 @@ struct netfs_request_ops { int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio *folio, void **_fsdata); void (*done)(struct netfs_io_request *rreq); - void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; /* -- cgit v1.2.3 From 8bee9dd953b69c634d1c9a3241a8b357469ad4aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Fri, 10 Jun 2022 01:41:10 +0200 Subject: workqueue: Switch to new kerneldoc syntax for named variable macro argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The syntax without dots is available since commit 43756e347f21 ("scripts/kernel-doc: Add support for named variable macro arguments"). The same HTML output is produced with and without this patch. Signed-off-by: Jonathan Neuschäfer Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e1f1c8b1121b..62e75dd40d9a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -406,7 +406,7 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) - * @args...: args for @fmt + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are -- cgit v1.2.3