From a85857633b04d57f4524cca0a2bfaf87b2543f9f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 25 Apr 2018 13:26:23 -0400 Subject: nfsd4: support change_attr_type attribute The change attribute is what is used by clients to revalidate their caches. Our server may use i_version or ctime for that purpose. Those choices behave slightly differently, and it may be useful to the client to know which we're using. This attribute tells the client that. The Linux client doesn't yet use this attribute yet, though. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/linux/nfs4.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 57ffaa20d564..0877ed333733 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -374,6 +374,13 @@ enum lock_type4 { NFS4_WRITEW_LT = 4 }; +enum change_attr_type4 { + NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, + NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, + NFS4_CHANGE_TYPE_IS_UNDEFINED = 4 +}; /* Mandatory Attributes */ #define FATTR4_WORD0_SUPPORTED_ATTRS (1UL << 0) @@ -441,6 +448,7 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) +#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) -- cgit v1.2.3 From f2b1d2f94af887c91bb8a0cfb495e546331bc5ed Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 30 May 2018 17:25:13 +0200 Subject: soc: renesas: rcar-sysc: Provide helpers to power up/down CPUs Provide helpers to control CPU power areas from platform code, taking just a CPU index. This will avoid having to pass full CPU power area parameter blocks, and thus duplicating information already provided by SoC-specific SYSC drivers. This will be used on R-Car H1 only. Later R-Car generations rely on APMU/RST for CPU power area control. Signed-off-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/linux/soc/renesas/rcar-sysc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/renesas/rcar-sysc.h b/include/linux/soc/renesas/rcar-sysc.h index 8a6086d2e9c3..9020da2111fd 100644 --- a/include/linux/soc/renesas/rcar-sysc.h +++ b/include/linux/soc/renesas/rcar-sysc.h @@ -13,5 +13,7 @@ struct rcar_sysc_ch { int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch); int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch); void rcar_sysc_init(phys_addr_t base, u32 syscier); +int rcar_sysc_power_down_cpu(unsigned int cpu); +int rcar_sysc_power_up_cpu(unsigned int cpu); #endif /* __LINUX_SOC_RENESAS_RCAR_SYSC_H__ */ -- cgit v1.2.3 From 7e8a50df26f4e7003d09f7e8d1e57fbbb7ebb750 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 30 May 2018 17:25:16 +0200 Subject: soc: renesas: rcar-sysc: Drop legacy handling Now the R-Car platform code no longer supports DTBs lacking a SYSC device node in DT, all legacy handling can be dropped from the R-Car SYSC driver: - Make rcar_sysc_ch private to the driver, - Make rcar_sysc_power_{down,up}() static (they have been replaced by rcar_sysc_power_{down,up}_cpu()), - Remove the legacy wrapper rcar_sysc_init(), and the check for double initialization (only the early_initcall is left). Signed-off-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/linux/soc/renesas/rcar-sysc.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/renesas/rcar-sysc.h b/include/linux/soc/renesas/rcar-sysc.h index 9020da2111fd..00fae6fd234d 100644 --- a/include/linux/soc/renesas/rcar-sysc.h +++ b/include/linux/soc/renesas/rcar-sysc.h @@ -2,17 +2,6 @@ #ifndef __LINUX_SOC_RENESAS_RCAR_SYSC_H__ #define __LINUX_SOC_RENESAS_RCAR_SYSC_H__ -#include - -struct rcar_sysc_ch { - u16 chan_offs; - u8 chan_bit; - u8 isr_bit; -}; - -int rcar_sysc_power_down(const struct rcar_sysc_ch *sysc_ch); -int rcar_sysc_power_up(const struct rcar_sysc_ch *sysc_ch); -void rcar_sysc_init(phys_addr_t base, u32 syscier); int rcar_sysc_power_down_cpu(unsigned int cpu); int rcar_sysc_power_up_cpu(unsigned int cpu); -- cgit v1.2.3 From aece27a2f01be4bb7683790f69cd1bed3a0929a2 Mon Sep 17 00:00:00 2001 From: Samuel Morris Date: Tue, 29 May 2018 10:06:12 +0000 Subject: ata: ahci_platform: allow disabling of hotplug to save power A number of resources remain powered to support hotplug. On platforms I've worked with, allowing the ahci_platform to suspend saves about 150mW. This patch enables rpm and allows the device to be auto-suspended through sysfs. Signed-off-by: Samuel Morris Signed-off-by: Tejun Heo --- include/linux/ahci_platform.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 1b0a17b22cd3..6396e6982103 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -42,5 +42,7 @@ int ahci_platform_suspend_host(struct device *dev); int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); int ahci_platform_resume(struct device *dev); +int ahci_platform_runtime_suspend(struct device *dev); +int ahci_platform_runtime_resume(struct device *dev); #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 77bc8c28ddac90287bc42c129002eb703288d550 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 18 Jun 2018 17:32:30 +0200 Subject: ARM: mvebu: convert secondary CPU clock sync to hotplug state The current call site in boot_secondary is causing sleep in invalid context warnings, as this part of the code is running with interrrupts disabled and some of the calls into the clock framework might sleep on a mutex. Convert the secondary CPU clock sync to a hotplug state, which allows to call it from a sleepable context. Signed-off-by: Lucas Stach Signed-off-by: Gregory CLEMENT --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8796ba387152..fa54e5fbea38 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -143,6 +143,7 @@ enum cpuhp_state { CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, + CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, -- cgit v1.2.3 From dc8fbeb0ffde1f2395449006019e2c89c177df50 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Fri, 22 Jun 2018 00:41:26 +0200 Subject: ARM: OMAP1: Get rid of Split the header file into two parts and move them to directories where they belong. Information on internal structure of FIQ buffer is moved to for ams-delta-serio driver use. Other information used by ams-delta board init file and FIQ code is made local to mach-omap1 root directory. Signed-off-by: Janusz Krzysztofik Acked-by: Dmitry Torokhov Signed-off-by: Tony Lindgren --- include/linux/platform_data/ams-delta-fiq.h | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/linux/platform_data/ams-delta-fiq.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ams-delta-fiq.h b/include/linux/platform_data/ams-delta-fiq.h new file mode 100644 index 000000000000..dc0f835ea918 --- /dev/null +++ b/include/linux/platform_data/ams-delta-fiq.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * include/linux/platform_data/ams-delta-fiq.h + * + * Taken from the original Amstrad modifications to fiq.h + * + * Copyright (c) 2004 Amstrad Plc + * Copyright (c) 2006 Matt Callow + * Copyright (c) 2010 Janusz Krzysztofik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __LINUX_PLATFORM_DATA_AMS_DELTA_FIQ_H +#define __LINUX_PLATFORM_DATA_AMS_DELTA_FIQ_H + +/* + * These are the offsets from the beginning of the fiq_buffer. They are put here + * since the buffer and header need to be accessed by drivers servicing devices + * which generate GPIO interrupts - e.g. keyboard, modem, hook switch. + */ +#define FIQ_MASK 0 +#define FIQ_STATE 1 +#define FIQ_KEYS_CNT 2 +#define FIQ_TAIL_OFFSET 3 +#define FIQ_HEAD_OFFSET 4 +#define FIQ_BUF_LEN 5 +#define FIQ_KEY 6 +#define FIQ_MISSED_KEYS 7 +#define FIQ_BUFFER_START 8 +#define FIQ_GPIO_INT_MASK 9 +#define FIQ_KEYS_HICNT 10 +#define FIQ_IRQ_PEND 11 +#define FIQ_SIR_CODE_L1 12 +#define IRQ_SIR_CODE_L2 13 + +#define FIQ_CNT_INT_00 14 +#define FIQ_CNT_INT_KEY 15 +#define FIQ_CNT_INT_MDM 16 +#define FIQ_CNT_INT_03 17 +#define FIQ_CNT_INT_HSW 18 +#define FIQ_CNT_INT_05 19 +#define FIQ_CNT_INT_06 20 +#define FIQ_CNT_INT_07 21 +#define FIQ_CNT_INT_08 22 +#define FIQ_CNT_INT_09 23 +#define FIQ_CNT_INT_10 24 +#define FIQ_CNT_INT_11 25 +#define FIQ_CNT_INT_12 26 +#define FIQ_CNT_INT_13 27 +#define FIQ_CNT_INT_14 28 +#define FIQ_CNT_INT_15 29 + +#define FIQ_CIRC_BUFF 30 /*Start of circular buffer */ + +#ifndef __ASSEMBLER__ +extern unsigned int fiq_buffer[]; +#endif + +#endif -- cgit v1.2.3 From 5f73861fae087df19f7337620da65c99e4260c72 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Fri, 22 Jun 2018 00:41:28 +0200 Subject: Input: ams_delta_serio: Get FIQ buffer from platform_data Instead of exporting the FIQ buffer symbol to be used in ams-delta-serio driver, pass it to the driver as platform_data. Signed-off-by: Janusz Krzysztofik Acked-by: Dmitry Torokhov Signed-off-by: Tony Lindgren --- include/linux/platform_data/ams-delta-fiq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/ams-delta-fiq.h b/include/linux/platform_data/ams-delta-fiq.h index dc0f835ea918..cf4589ccb720 100644 --- a/include/linux/platform_data/ams-delta-fiq.h +++ b/include/linux/platform_data/ams-delta-fiq.h @@ -55,8 +55,4 @@ #define FIQ_CIRC_BUFF 30 /*Start of circular buffer */ -#ifndef __ASSEMBLER__ -extern unsigned int fiq_buffer[]; -#endif - #endif -- cgit v1.2.3 From d082852f40de5cf55a7a689bf582fced39f5443e Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Fri, 22 Jun 2018 13:32:48 +0800 Subject: ARM: imx: enable bus auto clock gating function for i.mx6sll i.MX6SLL has HW bus auto clock gating function, enable it by default to save VDD_SOC_IN power, about 5% ~ 20% saved depends on different use cases. Signed-off-by: Anson Huang Acked-by: Lee Jones Signed-off-by: Shawn Guo --- include/linux/mfd/syscon/imx6q-iomuxc-gpr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h index e06f5f79eaef..6c1ad160ed87 100644 --- a/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h +++ b/include/linux/mfd/syscon/imx6q-iomuxc-gpr.h @@ -457,4 +457,7 @@ #define MCLK_DIR(x) (x == 1 ? IMX6UL_GPR1_SAI1_MCLK_DIR : x == 2 ? \ IMX6UL_GPR1_SAI2_MCLK_DIR : IMX6UL_GPR1_SAI3_MCLK_DIR) +/* For imx6sll iomux gpr register field define */ +#define IMX6SLL_GPR5_AFCG_X_BYPASS_MASK (0x1f << 11) + #endif /* __LINUX_IMX6Q_IOMUXC_GPR_H */ -- cgit v1.2.3 From 63453b59e41173241c4efe9335815f6432fa8586 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Wed, 20 Jun 2018 10:51:53 +0200 Subject: i2c: smbus: add unlocked __i2c_smbus_xfer variant Removes all locking from i2c_smbus_xfer and renames it to __i2c_smbus_xfer, then adds a new i2c_smbus_xfer function that simply grabs the lock while calling the unlocked variant. This is not perfectly equivalent, since i2c_smbus_xfer was callable from atomic/irq context if you happened to end up emulating SMBus with an I2C transfer, and that is no longer the case with this patch. It is unknown (to me) if anything depends on that quirk, but it seems fragile enough to simply break those cases and require them to call i2c_transfer directly instead. While at it, for consistency rename the 2nd to last argument (size) of the i2c_smbus_xfer declaration to protocol and remove the surplus extern marker. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 254cd34eeae2..465afb092fa7 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -140,9 +140,14 @@ extern int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, and probably just as fast. Note that we use i2c_adapter here, because you do not need a specific smbus adapter to call this function. */ -extern s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, - unsigned short flags, char read_write, u8 command, - int size, union i2c_smbus_data *data); +s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, + unsigned short flags, char read_write, u8 command, + int protocol, union i2c_smbus_data *data); + +/* Unlocked flavor */ +s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, + unsigned short flags, char read_write, u8 command, + int protocol, union i2c_smbus_data *data); /* Now follow the 'nice' access routines. These also document the calling conventions of i2c_smbus_xfer. */ -- cgit v1.2.3 From 7a872b6fb7fdc4213e9bb4e1c83a65e6b8af7ebd Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 4 Jul 2018 20:19:06 -0700 Subject: soc: ti: wkup_m3_ipc: Add rtc_only with ddr in self refresh mode support Adds rtc_only support. This needs resume function to shutdown and reboot the m3. Signed-off-by: Keerthy Signed-off-by: Santosh Shilimkar --- include/linux/wkup_m3_ipc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index d6ba7d39a62f..d639df15e8ba 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -40,6 +40,7 @@ struct wkup_m3_ipc { struct mbox_chan *mbox; struct wkup_m3_ipc_ops *ops; + int is_rtc_only; }; struct wkup_m3_ipc_ops { @@ -48,8 +49,10 @@ struct wkup_m3_ipc_ops { int (*prepare_low_power)(struct wkup_m3_ipc *m3_ipc, int state); int (*finish_low_power)(struct wkup_m3_ipc *m3_ipc); int (*request_pm_status)(struct wkup_m3_ipc *m3_ipc); + void (*set_rtc_only)(struct wkup_m3_ipc *m3_ipc); }; struct wkup_m3_ipc *wkup_m3_ipc_get(void); void wkup_m3_ipc_put(struct wkup_m3_ipc *m3_ipc); +void wkup_m3_set_rtc_only_mode(void); #endif /* _LINUX_WKUP_M3_IPC_H */ -- cgit v1.2.3 From ec93b62fec9c7138d2b75334d192ecc12376f885 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Wed, 4 Jul 2018 20:19:06 -0700 Subject: soc: ti: wkup_m3_ipc: Add wkup_m3_request_wake_src Add wkup_m3_request_wake_src to allow users to get the name of the wakeup source after a DeepSleep or Standby transition. Signed-off-by: Dave Gerlach Signed-off-by: Keerthy Signed-off-by: Santosh Shilimkar --- include/linux/wkup_m3_ipc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h index d639df15e8ba..e497e621dbb7 100644 --- a/include/linux/wkup_m3_ipc.h +++ b/include/linux/wkup_m3_ipc.h @@ -43,12 +43,18 @@ struct wkup_m3_ipc { int is_rtc_only; }; +struct wkup_m3_wakeup_src { + int irq_nr; + char src[10]; +}; + struct wkup_m3_ipc_ops { void (*set_mem_type)(struct wkup_m3_ipc *m3_ipc, int mem_type); void (*set_resume_address)(struct wkup_m3_ipc *m3_ipc, void *addr); int (*prepare_low_power)(struct wkup_m3_ipc *m3_ipc, int state); int (*finish_low_power)(struct wkup_m3_ipc *m3_ipc); int (*request_pm_status)(struct wkup_m3_ipc *m3_ipc); + const char *(*request_wake_src)(struct wkup_m3_ipc *m3_ipc); void (*set_rtc_only)(struct wkup_m3_ipc *m3_ipc); }; -- cgit v1.2.3 From 0f725561e168485eff7277d683405c05b192f537 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 7 Jun 2018 09:56:59 -0700 Subject: iommu/vt-d: Add definitions for PFSID When SRIOV VF device IOTLB is invalidated, we need to provide the PF source ID such that IOMMU hardware can gauge the depth of invalidation queue which is shared among VFs. This is needed when device invalidation throttle (DIT) capability is supported. This patch adds bit definitions for checking and tracking PFSID. Signed-off-by: Jacob Pan Cc: stable@vger.kernel.org Cc: "Ashok Raj" Cc: "Lu Baolu" Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1df940196ab2..3b1c37155572 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -114,6 +114,7 @@ * Extended Capability Register */ +#define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) #define ecap_pss(e) ((e >> 35) & 0x1f) #define ecap_eafs(e) ((e >> 34) & 0x1) @@ -283,6 +284,7 @@ enum { #define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) #define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) #define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 @@ -307,6 +309,7 @@ enum { #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) #define QI_DEV_EIOTLB_MAX_INVS 32 #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55) -- cgit v1.2.3 From 1c48db44924298ad0cb5a6386b88017539be8822 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 7 Jun 2018 09:57:00 -0700 Subject: iommu/vt-d: Fix dev iotlb pfsid use PFSID should be used in the invalidation descriptor for flushing device IOTLBs on SRIOV VFs. Signed-off-by: Jacob Pan Cc: stable@vger.kernel.org Cc: "Ashok Raj" Cc: "Lu Baolu" Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3b1c37155572..6692b40ca814 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -455,9 +455,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, - u64 addr, unsigned mask); - +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- cgit v1.2.3 From bad614b24293ae463e74d2465685f0e4e229baca Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 12 Jun 2018 16:41:21 -0500 Subject: iommu: Enable debugfs exposure of IOMMU driver internals Provide base enablement for using debugfs to expose internal data of an IOMMU driver. When called, create the /sys/kernel/debug/iommu directory. Emit a strong warning at boot time to indicate that this feature is enabled. This function is called from iommu_init, and creates the initial DebugFS directory. Drivers may then call iommu_debugfs_new_driver_dir() to instantiate a device-specific directory to expose internal data. It will return a pointer to the new dentry structure created in /sys/kernel/debug/iommu, or NULL in the event of a failure. Since the IOMMU driver can not be removed from the running system, there is no need for an "off" function. Signed-off-by: Gary R Hook Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 19938ee6eb31..7447b0b0579a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -698,4 +698,11 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IOMMU_DEBUGFS +extern struct dentry *iommu_debugfs_dir; +void iommu_debugfs_setup(void); +#else +static inline void iommu_debugfs_setup(void) {} +#endif + #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From 818b7587b4d34e989ea6c042eeb8d50ffa5be13e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 27 Jun 2018 10:31:20 -0500 Subject: x86: irq_remapping: Move irq remapping mode enum The enum is currently defined in Intel-specific DMAR header file, but it is also used by APIC common code. Therefore, move it to a more appropriate interrupt-remapping common header file. This will also be used by subsequent patches. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Joerg Roedel Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e2433bc50210..843a41ba7e28 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -265,11 +265,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) #define PDA_LOW_BIT 26 #define PDA_HIGH_BIT 32 -enum { - IRQ_REMAP_XAPIC_MODE, - IRQ_REMAP_X2APIC_MODE, -}; - /* Can't use the common MSI interrupt functions * since DMAR is not a pci device */ -- cgit v1.2.3 From 3ffa6583e24e1ad1abab836d24bfc9d2308074e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 25 Jun 2018 09:51:48 +0200 Subject: power: remove possible deadlock when unregistering power_supply If a device gets removed right after having registered a power_supply node, we might enter in a deadlock between the remove call (that has a lock on the parent device) and the deferred register work. Allow the deferred register work to exit without taking the lock when we are in the remove state. Stack trace on a Ubuntu 16.04: [16072.109121] INFO: task kworker/u16:2:1180 blocked for more than 120 seconds. [16072.109127] Not tainted 4.13.0-41-generic #46~16.04.1-Ubuntu [16072.109129] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [16072.109132] kworker/u16:2 D 0 1180 2 0x80000000 [16072.109142] Workqueue: events_power_efficient power_supply_deferred_register_work [16072.109144] Call Trace: [16072.109152] __schedule+0x3d6/0x8b0 [16072.109155] schedule+0x36/0x80 [16072.109158] schedule_preempt_disabled+0xe/0x10 [16072.109161] __mutex_lock.isra.2+0x2ab/0x4e0 [16072.109166] __mutex_lock_slowpath+0x13/0x20 [16072.109168] ? __mutex_lock_slowpath+0x13/0x20 [16072.109171] mutex_lock+0x2f/0x40 [16072.109174] power_supply_deferred_register_work+0x2b/0x50 [16072.109179] process_one_work+0x15b/0x410 [16072.109182] worker_thread+0x4b/0x460 [16072.109186] kthread+0x10c/0x140 [16072.109189] ? process_one_work+0x410/0x410 [16072.109191] ? kthread_create_on_node+0x70/0x70 [16072.109194] ret_from_fork+0x35/0x40 [16072.109199] INFO: task test:2257 blocked for more than 120 seconds. [16072.109202] Not tainted 4.13.0-41-generic #46~16.04.1-Ubuntu [16072.109204] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [16072.109206] test D 0 2257 2256 0x00000004 [16072.109208] Call Trace: [16072.109211] __schedule+0x3d6/0x8b0 [16072.109215] schedule+0x36/0x80 [16072.109218] schedule_timeout+0x1f3/0x360 [16072.109221] ? check_preempt_curr+0x5a/0xa0 [16072.109224] ? ttwu_do_wakeup+0x1e/0x150 [16072.109227] wait_for_completion+0xb4/0x140 [16072.109230] ? wait_for_completion+0xb4/0x140 [16072.109233] ? wake_up_q+0x70/0x70 [16072.109236] flush_work+0x129/0x1e0 [16072.109240] ? worker_detach_from_pool+0xb0/0xb0 [16072.109243] __cancel_work_timer+0x10f/0x190 [16072.109247] ? device_del+0x264/0x310 [16072.109250] ? __wake_up+0x44/0x50 [16072.109253] cancel_delayed_work_sync+0x13/0x20 [16072.109257] power_supply_unregister+0x37/0xb0 [16072.109260] devm_power_supply_release+0x11/0x20 [16072.109263] release_nodes+0x110/0x200 [16072.109266] devres_release_group+0x7c/0xb0 [16072.109274] wacom_remove+0xc2/0x110 [wacom] [16072.109279] hid_device_remove+0x6e/0xd0 [hid] [16072.109284] device_release_driver_internal+0x158/0x210 [16072.109288] device_release_driver+0x12/0x20 [16072.109291] bus_remove_device+0xec/0x160 [16072.109293] device_del+0x1de/0x310 [16072.109298] hid_destroy_device+0x27/0x60 [hid] [16072.109303] usbhid_disconnect+0x51/0x70 [usbhid] [16072.109308] usb_unbind_interface+0x77/0x270 [16072.109311] device_release_driver_internal+0x158/0x210 [16072.109315] device_release_driver+0x12/0x20 [16072.109318] usb_driver_release_interface+0x77/0x80 [16072.109321] proc_ioctl+0x20f/0x250 [16072.109325] usbdev_do_ioctl+0x57f/0x1140 [16072.109327] ? __wake_up+0x44/0x50 [16072.109331] usbdev_ioctl+0xe/0x20 [16072.109336] do_vfs_ioctl+0xa4/0x600 [16072.109339] ? vfs_write+0x15a/0x1b0 [16072.109343] SyS_ioctl+0x79/0x90 [16072.109347] entry_SYSCALL_64_fastpath+0x24/0xab [16072.109349] RIP: 0033:0x7f20da807f47 [16072.109351] RSP: 002b:00007ffc422ae398 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [16072.109353] RAX: ffffffffffffffda RBX: 00000000010b8560 RCX: 00007f20da807f47 [16072.109355] RDX: 00007ffc422ae3a0 RSI: 00000000c0105512 RDI: 0000000000000009 [16072.109356] RBP: 0000000000000000 R08: 00007ffc422ae3e0 R09: 0000000000000010 [16072.109357] R10: 00000000000000a6 R11: 0000000000000246 R12: 0000000000000000 [16072.109359] R13: 00000000010b8560 R14: 00007ffc422ae2e0 R15: 0000000000000000 Reported-and-tested-by: Richard Hughes Tested-by: Aaron Skomra Signed-off-by: Benjamin Tissoires Fixes: 7f1a57fdd6cb ("power_supply: Fix possible NULL pointer dereference on early uevent") Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index b21c4bd96b84..f80769175c56 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -269,6 +269,7 @@ struct power_supply { spinlock_t changed_lock; bool changed; bool initialized; + bool removing; atomic_t use_cnt; #ifdef CONFIG_THERMAL struct thermal_zone_device *tzd; -- cgit v1.2.3 From 7efe25a70c37395b41b034335345d6855926c2a6 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 7 Jun 2018 19:44:56 +0000 Subject: iommu/shmobile: Remove unused include/linux/platform_data/sh_ipmmu.h header include/linux/platform_data/sh_ipmmu.h is unused since commit ae50dc4874c5 ("iommu/shmobile: Remove unused Renesas IPMMU/IPMMUI") Let's remove it. Signed-off-by: Corentin Labbe Reviewed-by: Geert Uytterhoeven Reviewed-by: Simon Horman Signed-off-by: Joerg Roedel --- include/linux/platform_data/sh_ipmmu.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/platform_data/sh_ipmmu.h (limited to 'include/linux') diff --git a/include/linux/platform_data/sh_ipmmu.h b/include/linux/platform_data/sh_ipmmu.h deleted file mode 100644 index 39f7405cdac5..000000000000 --- a/include/linux/platform_data/sh_ipmmu.h +++ /dev/null @@ -1,18 +0,0 @@ -/* sh_ipmmu.h - * - * Copyright (C) 2012 Hideki EIRAKU - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - */ - -#ifndef __SH_IPMMU_H__ -#define __SH_IPMMU_H__ - -struct shmobile_ipmmu_platform_data { - const char * const *dev_names; - unsigned int num_dev_names; -}; - -#endif /* __SH_IPMMU_H__ */ -- cgit v1.2.3 From 5740c99e9d30b81fcc478797e7215c61e241f44e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jul 2018 23:57:03 +0200 Subject: vfs: dedupe: return int Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5c91108846db..b81c4b7e339f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1749,7 +1749,7 @@ struct file_operations { loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, + int (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); } __randomize_layout; -- cgit v1.2.3 From 87eb5eb2423213ac0e7315ce5d275f1ff80e0263 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jul 2018 23:57:03 +0200 Subject: vfs: dedupe: rationalize args Clean up f_op->dedupe_file_range() interface. 1) Use loff_t for offsets and length instead of u64 2) Order the arguments the same way as {copy|clone}_file_range(). Signed-off-by: Miklos Szeredi Reviewed-by: Darrick J. Wong --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b81c4b7e339f..a8fee2f44981 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1749,7 +1749,7 @@ struct file_operations { loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - int (*dedupe_file_range)(struct file *, u64, u64, struct file *, + int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); } __randomize_layout; -- cgit v1.2.3 From 7f35e63dbfcb627bd30bac45702ffdf1ddde1516 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Mon, 9 Jul 2018 22:18:38 +0530 Subject: bus: ti-sysc: Add support for using ti-sysc for MCAN on dra76x The dra76x MCAN generic interconnect module has a its own format for the bits in the control registers. Therefore add a new module type, new regbits and new capabilities specific to the MCAN module. Acked-by: Rob Herring CC: Tony Lindgren Signed-off-by: Faiz Abbas Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 990aad477458..2efa3470a451 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -14,6 +14,7 @@ enum ti_sysc_module_type { TI_SYSC_OMAP4_SR, TI_SYSC_OMAP4_MCASP, TI_SYSC_OMAP4_USB_HOST_FS, + TI_SYSC_DRA7_MCAN, }; struct ti_sysc_cookie { -- cgit v1.2.3 From 74655749a58405e259eaaba66bfc391fdbe1e34e Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Mon, 9 Jul 2018 13:03:16 +0530 Subject: ARM: OMAP2+: sleep33/43xx: Make sleep actions configurable Add an argument to the sleep33xx and sleep43xx code to allow us to set flags to determine which portions of the code get called in order to use the same code for multiple power saving modes. This patch allows us to decide whether or not we flush and disable caches, save EMIF context, put the memory into self refresh and disable the EMIF, and/or invoke the wkup_m3 when entering into WFI. Signed-off-by: Dave Gerlach Signed-off-by: Tero Kristo Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- include/linux/platform_data/pm33xx.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/pm33xx.h b/include/linux/platform_data/pm33xx.h index f9bed2a0af9d..d231265c135c 100644 --- a/include/linux/platform_data/pm33xx.h +++ b/include/linux/platform_data/pm33xx.h @@ -12,6 +12,29 @@ #include #include +/* + * WFI Flags for sleep code control + * + * These flags allow PM code to exclude certain operations from happening + * in the low level ASM code found in sleep33xx.S and sleep43xx.S + * + * WFI_FLAG_FLUSH_CACHE: Flush the ARM caches and disable caching. Only + * needed when MPU will lose context. + * WFI_FLAG_SELF_REFRESH: Let EMIF place DDR memory into self-refresh and + * disable EMIF. + * WFI_FLAG_SAVE_EMIF: Save context of all EMIF registers and restore in + * resume path. Only needed if PER domain loses context + * and must also have WFI_FLAG_SELF_REFRESH set. + * WFI_FLAG_WAKE_M3: Disable MPU clock or clockdomain to cause wkup_m3 to + * execute when WFI instruction executes. + * WFI_FLAG_RTC_ONLY: Configure the RTC to enter RTC+DDR mode. + */ +#define WFI_FLAG_FLUSH_CACHE BIT(0) +#define WFI_FLAG_SELF_REFRESH BIT(1) +#define WFI_FLAG_SAVE_EMIF BIT(2) +#define WFI_FLAG_WAKE_M3 BIT(3) +#define WFI_FLAG_RTC_ONLY BIT(4) + #ifndef __ASSEMBLER__ struct am33xx_pm_sram_addr { void (*do_wfi)(void); @@ -23,7 +46,8 @@ struct am33xx_pm_sram_addr { struct am33xx_pm_platform_data { int (*init)(void); - int (*soc_suspend)(unsigned int state, int (*fn)(unsigned long)); + int (*soc_suspend)(unsigned int state, int (*fn)(unsigned long), + unsigned long args); struct am33xx_pm_sram_addr *(*get_sram_addrs)(void); }; -- cgit v1.2.3 From 8c5a916f4c8815196cc8a86b9582ca89422aac25 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Mon, 9 Jul 2018 13:03:17 +0530 Subject: ARM: OMAP2+: sleep33/43xx: Add RTC-Mode support Add support for RTC mode to low level suspend code. This includes providing the rtc base address for the assembly code to configuring the PMIC_PWR_EN line late in suspend to enter RTC+DDR mode. Note: This patch also fold in left out space parameter for am33xx_emif_sram_table and am43xx_emif_sram_table Signed-off-by: Dave Gerlach Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- include/linux/platform_data/pm33xx.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/pm33xx.h b/include/linux/platform_data/pm33xx.h index d231265c135c..fbf5ed73c7cc 100644 --- a/include/linux/platform_data/pm33xx.h +++ b/include/linux/platform_data/pm33xx.h @@ -42,6 +42,7 @@ struct am33xx_pm_sram_addr { unsigned long *resume_offset; unsigned long *emif_sram_table; unsigned long *ro_sram_data; + unsigned long resume_address; }; struct am33xx_pm_platform_data { @@ -49,6 +50,7 @@ struct am33xx_pm_platform_data { int (*soc_suspend)(unsigned int state, int (*fn)(unsigned long), unsigned long args); struct am33xx_pm_sram_addr *(*get_sram_addrs)(void); + void __iomem *(*get_rtc_base_addr)(void); }; struct am33xx_pm_sram_data { @@ -60,6 +62,7 @@ struct am33xx_pm_sram_data { struct am33xx_pm_ro_sram_data { u32 amx3_pm_sram_data_virt; u32 amx3_pm_sram_data_phys; + void __iomem *rtc_base_virt; } __packed __aligned(8); #endif /* __ASSEMBLER__ */ -- cgit v1.2.3 From 7f69ae7fadd9dbebe563bfc28b993a2bc9c8acf9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 28 Jun 2018 11:57:48 +0200 Subject: ARM: davinci: unduplicate aemif support All users now register platform devices using the ti-aemif driver. Remove the handcrafted aemif API. Signed-off-by: Bartosz Golaszewski Signed-off-by: Sekhar Nori --- include/linux/platform_data/mtd-davinci-aemif.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mtd-davinci-aemif.h b/include/linux/platform_data/mtd-davinci-aemif.h index 97948ac2bb9b..a403dd51dacc 100644 --- a/include/linux/platform_data/mtd-davinci-aemif.h +++ b/include/linux/platform_data/mtd-davinci-aemif.h @@ -33,5 +33,4 @@ struct davinci_aemif_timing { u8 ta; }; -int davinci_aemif_setup(struct platform_device *pdev); #endif -- cgit v1.2.3 From fae68031f7fbc8b6db58d87830ba7ed1d696fbb1 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 6 Jul 2018 07:35:50 +0200 Subject: w1: core: match sub-nodes of bus masters in devicetree Once a new slave device is detected, match it against all sub-nodes of the master bus controller. If a match is found, set the slave device's of_node pointer. Signed-off-by: Daniel Mack Signed-off-by: Sebastian Reichel --- include/linux/w1.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/w1.h b/include/linux/w1.h index 694101f744c7..3111585c371f 100644 --- a/include/linux/w1.h +++ b/include/linux/w1.h @@ -274,6 +274,8 @@ struct w1_family { struct w1_family_ops *fops; + const struct of_device_id *of_match_table; + atomic_t refcnt; }; -- cgit v1.2.3 From 23ebda2fc715534d383d59ae6740d4e3ebd43798 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 11 Jul 2018 17:21:05 +0200 Subject: libata: remove ata_sff_data_xfer_noirq() ata_sff_data_xfer_noirq() is invoked via the ->sff_data_xfer hook. The latter is invoked by ata_pio_sector(), atapi_send_cdb() and __atapi_pio_bytes() which in turn is invoked by ata_sff_hsm_move(). The latter function requires that the "ap->lock" lock is held which needs to be taken with disabled interrupts. There is no need have to have ata_sff_data_xfer_noirq() which invokes ata_sff_data_xfer32() with disabled interrupts because at this point the interrupts are already disabled. Remove the function and its references to it and replace all callers with ata_sff_data_xfer32(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- include/linux/libata.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 8b8946dd63b9..aa8583655a18 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1832,8 +1832,6 @@ extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); -extern unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, - unsigned char *buf, unsigned int buflen, int rw); extern void ata_sff_irq_on(struct ata_port *ap); extern void ata_sff_irq_clear(struct ata_port *ap); extern int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc, -- cgit v1.2.3 From 2abc77af89e17582db9039293c8ac881c8c96d79 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 Jul 2018 11:18:42 -0400 Subject: new helper: open_with_fake_path() open a file by given inode, faking ->f_path. Use with shitloads of caution - at the very least you'd damn better make sure that some dentry alias of that inode is pinned down by the path in question. Again, this is no general-purpose interface and I hope it will eventually go away. Right now overlayfs wants something like that, but nothing else should. Any out-of-tree code with bright idea of using this one *will* eventually get hurt, with zero notice and great delight on my part. I refuse to use EXPORT_SYMBOL_GPL(), especially in situations when it's really EXPORT_SYMBOL_DONT_USE_IT(), but don't take that export as "you are welcome to use it". Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 05f34726e29c..4ff7b7012186 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2424,6 +2424,8 @@ extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file * open_with_fake_path(const struct path *, int, + struct inode*, const struct cred *); static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); -- cgit v1.2.3 From 3f3a89e1d7c31558c070692241e3d6146d2cf1bf Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Wed, 20 Jun 2018 07:18:03 +0200 Subject: i2c: remove i2c_lock_adapter and use i2c_lock_bus directly The i2c_lock_adapter name is ambiguous since it is unclear if it refers to the root adapter or the adapter you name in the argument. The natural interpretation is the adapter you name in the argument, but there are historical reasons for that not being the case; it in fact locks the root adapter. Just remove the function and force users to spell out the I2C_LOCK_ROOT_ADAPTER name to indicate what is really going on. Also remove i2c_unlock_adapter, of course. This patch was generated with git grep -l 'i2c_\(un\)\?lock_adapter' \ | xargs sed -i 's/i2c_\(un\)\?lock_adapter(\([^)]*\))/'\ 'i2c_\1lock_bus(\2, I2C_LOCK_ROOT_ADAPTER)/g' followed by white-space touch-up. Signed-off-by: Peter Rosin Acked-by: Jonathan Cameron Tested-by: Sekhar Nori Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 254cd34eeae2..795e3a860afe 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -754,18 +754,6 @@ i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) adapter->lock_ops->unlock_bus(adapter, flags); } -static inline void -i2c_lock_adapter(struct i2c_adapter *adapter) -{ - i2c_lock_bus(adapter, I2C_LOCK_ROOT_ADAPTER); -} - -static inline void -i2c_unlock_adapter(struct i2c_adapter *adapter) -{ - i2c_unlock_bus(adapter, I2C_LOCK_ROOT_ADAPTER); -} - /*flags for the client struct: */ #define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ #define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ -- cgit v1.2.3 From ffc59c496bf8498657321c59433f55bbcf2d9c38 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 10 Jul 2018 23:42:16 +0200 Subject: i2c: recovery: require either get_sda or set_sda For bus recovery, we either need to bail out early if we can read SDA or we need to send STOP after every pulse. Otherwise recovery might be misinterpreted as an unwanted write. So, require one of those SDA handling functions to avoid this problem. Signed-off-by: Wolfram Sang Acked-by: Peter Rosin Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 24acd69c8874..4c4360f94786 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -581,12 +581,12 @@ struct i2c_timings { * recovery. Populated internally for generic GPIO recovery. * @set_scl: This sets/clears the SCL line. Mandatory for generic SCL recovery. * Populated internally for generic GPIO recovery. - * @get_sda: This gets current value of SDA line. Optional for generic SCL - * recovery. Populated internally, if sda_gpio is a valid GPIO, for generic - * GPIO recovery. - * @set_sda: This sets/clears the SDA line. Optional for generic SCL recovery. - * Populated internally, if sda_gpio is a valid GPIO, for generic GPIO - * recovery. + * @get_sda: This gets current value of SDA line. This or set_sda() is mandatory + * for generic SCL recovery. Populated internally, if sda_gpio is a valid + * GPIO, for generic GPIO recovery. + * @set_sda: This sets/clears the SDA line. This or get_sda() is mandatory for + * generic SCL recovery. Populated internally, if sda_gpio is a valid GPIO, + * for generic GPIO recovery. * @prepare_recovery: This will be called before starting recovery. Platform may * configure padmux here for SDA/SCL line or something else they want. * @unprepare_recovery: This will be called after completing recovery. Platform -- cgit v1.2.3 From 7ca5f6be7900ca753ed01c0202dc5f998a41f4ee Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 11 Jul 2018 00:24:22 +0200 Subject: i2c: recovery: add get_bus_free callback Some IP cores have an internal 'bus free' logic which may be more advanced than just checking if SDA is high. Add a separate callback to get this status. Filling it is optional. Signed-off-by: Wolfram Sang Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 4c4360f94786..bc8d42f8544f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -587,6 +587,8 @@ struct i2c_timings { * @set_sda: This sets/clears the SDA line. This or get_sda() is mandatory for * generic SCL recovery. Populated internally, if sda_gpio is a valid GPIO, * for generic GPIO recovery. + * @get_bus_free: Returns the bus free state as seen from the IP core in case it + * has a more complex internal logic than just reading SDA. Optional. * @prepare_recovery: This will be called before starting recovery. Platform may * configure padmux here for SDA/SCL line or something else they want. * @unprepare_recovery: This will be called after completing recovery. Platform @@ -601,6 +603,7 @@ struct i2c_bus_recovery_info { void (*set_scl)(struct i2c_adapter *adap, int val); int (*get_sda)(struct i2c_adapter *adap); void (*set_sda)(struct i2c_adapter *adap, int val); + int (*get_bus_free)(struct i2c_adapter *adap); void (*prepare_recovery)(struct i2c_adapter *adap); void (*unprepare_recovery)(struct i2c_adapter *adap); -- cgit v1.2.3 From 5b56c182edb1224bc1a97a1c74003eaa0eb59daf Mon Sep 17 00:00:00 2001 From: Wenyou Yang Date: Tue, 17 Jul 2018 11:26:55 +0300 Subject: ARM: at91: pm: Add ULP1 mode support In the ULP1 mode, in order to achieve the lowest power consumption with the system in retention mode and be able to resume on the wake up events, all the clocks are shut off, inclusive the embedded 12MHz RC oscillator, and the number of wake up sources is limited as well. When the wake up event is asserted, the embedded 12MHz RC oscillator restarts automatically. The ULP1 (Ultra Low-power mode 1) is introduced by SAMA5D2. The previous size of pm_suspend.o was 2148 bytes. With the addition of ULP1 mode the new size of pm_suspend.o raised at 2456 bytes. Signed-off-by: Wenyou Yang Signed-off-by: Ludovic Desroches [claudiu.beznea@microchip.com: aligned with 4.18-rc1] Signed-off-by: Claudiu Beznea Signed-off-by: Alexandre Belloni --- include/linux/clk/at91_pmc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 6aca5ce8a99a..4ea2cbf9b50d 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -47,8 +47,10 @@ #define AT91_CKGR_MOR 0x20 /* Main Oscillator Register [not on SAM9RL] */ #define AT91_PMC_MOSCEN (1 << 0) /* Main Oscillator Enable */ #define AT91_PMC_OSCBYPASS (1 << 1) /* Oscillator Bypass */ +#define AT91_PMC_WAITMODE (1 << 2) /* Wait Mode Command */ #define AT91_PMC_MOSCRCEN (1 << 3) /* Main On-Chip RC Oscillator Enable [some SAM9] */ #define AT91_PMC_OSCOUNT (0xff << 8) /* Main Oscillator Start-up Time */ +#define AT91_PMC_KEY_MASK (0xff << 16) #define AT91_PMC_KEY (0x37 << 16) /* MOR Writing Key */ #define AT91_PMC_MOSCSEL (1 << 24) /* Main Oscillator Selection [some SAM9] */ #define AT91_PMC_CFDEN (1 << 25) /* Clock Failure Detector Enable [some SAM9] */ -- cgit v1.2.3 From 3abd729aa468d7346f12d7dfc8f81aba653f6c88 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Tue, 17 Jul 2018 11:26:56 +0300 Subject: ARM: at91: pm: add PMC fast startup registers defines Add PMC fast startup registers defines. Signed-off-by: Claudiu Beznea Signed-off-by: Alexandre Belloni --- include/linux/clk/at91_pmc.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 4ea2cbf9b50d..931ab05f771d 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -157,6 +157,19 @@ #define AT91_PMC_GCKRDY (1 << 24) /* Generated Clocks */ #define AT91_PMC_IMR 0x6c /* Interrupt Mask Register */ +#define AT91_PMC_FSMR 0x70 /* Fast Startup Mode Register */ +#define AT91_PMC_FSTT(n) BIT(n) +#define AT91_PMC_RTCAL BIT(17) /* RTC Alarm Enable */ +#define AT91_PMC_USBAL BIT(18) /* USB Resume Enable */ +#define AT91_PMC_SDMMC_CD BIT(19) /* SDMMC Card Detect Enable */ +#define AT91_PMC_LPM BIT(20) /* Low-power Mode */ +#define AT91_PMC_RXLP_MCE BIT(24) /* Backup UART Receive Enable */ +#define AT91_PMC_ACC_CE BIT(25) /* ACC Enable */ + +#define AT91_PMC_FSPR 0x74 /* Fast Startup Polarity Reg */ + +#define AT91_PMC_FS_INPUT_MASK 0x7ff + #define AT91_PMC_PLLICPR 0x80 /* PLL Charge Pump Current Register */ #define AT91_PMC_PROT 0xe4 /* Write Protect Mode Register [some SAM9] */ -- cgit v1.2.3 From d3b1084dfd629ef89bc1c4bab95e5cb87e7d08c2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:40 +0200 Subject: vfs: make open_with_fake_path() not contribute to nr_files Stacking file operations in overlay will store an extra open file for each overlay file opened. The overhead is just that of "struct file" which is about 256bytes, because overlay already pins an extra dentry and inode when the file is open, which add up to a much larger overhead. For fear of breaking working setups, don't start accounting the extra file. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ce2b413abc6..e1884840d556 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -156,6 +156,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)0x8000000) +/* File does not contribute to nr_files count */ +#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are -- cgit v1.2.3 From 9df6702ad0e85901450fe48a7b5f0f8975353eeb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:40 +0200 Subject: vfs: export vfs_ioctl() to modules This is needed by the stacked ioctl implementation in overlayfs. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e1884840d556..019817a083a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1634,6 +1634,8 @@ int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + /* * VFS file helper functions. */ -- cgit v1.2.3 From f182536684d876afaf4627c36a16c4e15ea8a2b8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:40 +0200 Subject: vfs: export vfs_dedupe_file_range_one() to modules This is needed by the stacked dedupe implementation in overlayfs. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 019817a083a0..b67209948f1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1829,6 +1829,10 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); +extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + u64 len); + struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); -- cgit v1.2.3 From 88059de155d4db817a3a78ba899cb3b7f4de0fb0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "ovl: fix relatime for directories" This reverts commit cd91304e7190b4c4802f8e413ab2214b233e0260. Overlayfs no longer relies on the vfs correct atime handling. Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 66c6e17e61e5..ddae4103d324 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -564,9 +564,6 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) return upper; } -/* d_real() flags */ -#define D_REAL_UPPER 0x2 /* return upper dentry or NULL if non-upper */ - /** * d_real - Return the real dentry * @dentry: the dentry to query -- cgit v1.2.3 From c6718543463dbb78486ad259f884cb800df802b5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "vfs: update ovl inode before relatime check" This reverts commit 598e3c8f72f5b77c84d2cb26cfd936ffb3cfdbaa. Overlayfs no longer relies on the vfs correct atime handling. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b67209948f1b..8f8c9ac1c9d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2088,6 +2088,7 @@ enum file_time_flags { S_VERSION = 8, }; +extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { -- cgit v1.2.3 From 4ab30319fd7c691a1b3165325c647a5cd6d282ac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "vfs: add flags to d_real()" This reverts commit 495e642939114478a5237a7d91661ba93b76f15a. No user of "flags" argument of d_real() remain. Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 11 +++++------ include/linux/fs.h | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ddae4103d324..8fe4efa94af6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -146,7 +146,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + unsigned int); } ____cacheline_aligned; /* @@ -568,8 +568,7 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) - * @open_flags: open flags to control copy-up behavior - * @flags: flags to control what is returned by this function + * @flags: open flags to control copy-up behavior * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. @@ -578,10 +577,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags, unsigned int flags) + unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode, open_flags, flags); + return dentry->d_op->d_real(dentry, inode, flags); else return dentry; } @@ -596,7 +595,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0, 0)); + return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); } struct name_snapshot { diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f8c9ac1c9d5..16e2741cec3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1249,7 +1249,7 @@ static inline struct inode *file_inode(const struct file *f) static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file), 0, 0); + return d_real(file->f_path.dentry, file_inode(file), 0); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -- cgit v1.2.3 From de2a4a501e716bbf5ff691ba16faf59a35320228 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Partially revert "locks: fix file locking on overlayfs" This partially reverts commit c568d68341be7030f5647def68851e469b21ca11. Overlayfs files will now automatically get the correct locks, no need to hack overlay support in VFS. It is a partial revert, because it leaves the locks_inode() calls in place and defines locks_inode() to file_inode(). We could revert those as well, but it would be unnecessary code churn and it makes sense to document that we are getting the inode for locking purposes. Don't revert MS_NOREMOTELOCK yet since that has been part of the userspace API for some time (though not in a useful way). Will try to remove internal flags later when the dust around the new mount API settles. Signed-off-by: Miklos Szeredi Acked-by: Jeff Layton --- include/linux/fs.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 16e2741cec3c..1cbcf37c45e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1054,17 +1054,7 @@ struct file_lock_context { extern void send_sigio(struct fown_struct *fown, int fd, int band); -/* - * Return the inode to use for locking - * - * For overlayfs this should be the overlay inode, not the real inode returned - * by file_inode(). For any other fs file_inode(filp) and locks_inode(filp) are - * equal. - */ -static inline struct inode *locks_inode(const struct file *f) -{ - return f->f_path.dentry->d_inode; -} +#define locks_inode(f) file_inode(f) #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, unsigned int, struct flock *); @@ -1305,7 +1295,6 @@ extern int send_sigurg(struct fown_struct *fown); /* These sb flags are internal to the kernel */ #define SB_SUBMOUNT (1<<26) -#define SB_NOREMOTELOCK (1<<27) #define SB_NOSEC (1<<28) #define SB_BORN (1<<29) #define SB_ACTIVE (1<<30) -- cgit v1.2.3 From 573e1784817ca1f13d76a0df636929e983e5de3c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:44 +0200 Subject: Revert "fsnotify: support overlayfs" This reverts commit f3fbbb079263bd29ae592478de6808db7e708267. Overlayfs now works correctly without adding hacks to fsnotify. Signed-off-by: Miklos Szeredi --- include/linux/fsnotify.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bdaf22582f6e..fd1ce10553bf 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -30,11 +30,7 @@ static inline int fsnotify_parent(const struct path *path, struct dentry *dentry static inline int fsnotify_perm(struct file *file, int mask) { const struct path *path = &file->f_path; - /* - * Do not use file_inode() here or anywhere in this file to get the - * inode. That would break *notity on overlayfs. - */ - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(file); __u32 fsnotify_mask = 0; int ret; @@ -178,7 +174,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) static inline void fsnotify_access(struct file *file) { const struct path *path = &file->f_path; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(file); __u32 mask = FS_ACCESS; if (S_ISDIR(inode->i_mode)) @@ -196,7 +192,7 @@ static inline void fsnotify_access(struct file *file) static inline void fsnotify_modify(struct file *file) { const struct path *path = &file->f_path; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(file); __u32 mask = FS_MODIFY; if (S_ISDIR(inode->i_mode)) @@ -214,7 +210,7 @@ static inline void fsnotify_modify(struct file *file) static inline void fsnotify_open(struct file *file) { const struct path *path = &file->f_path; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(file); __u32 mask = FS_OPEN; if (S_ISDIR(inode->i_mode)) @@ -230,7 +226,7 @@ static inline void fsnotify_open(struct file *file) static inline void fsnotify_close(struct file *file) { const struct path *path = &file->f_path; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(file); fmode_t mode = file->f_mode; __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; -- cgit v1.2.3 From fb16043b46831a75c9b076a7262ae035290b0409 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:44 +0200 Subject: vfs: remove open_flags from d_real() Opening regular files on overlayfs is now handled via ovl_open(). Remove the now unused "open_flags" argument from d_op->d_real() and the d_real() helper. Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 11 ++++------- include/linux/fs.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8fe4efa94af6..78cea80423a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -145,8 +145,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + struct dentry *(*d_real)(struct dentry *, const struct inode *); } ____cacheline_aligned; /* @@ -568,7 +567,6 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) - * @flags: open flags to control copy-up behavior * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. @@ -576,11 +574,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) * See also: Documentation/filesystems/vfs.txt */ static inline struct dentry *d_real(struct dentry *dentry, - const struct inode *inode, - unsigned int flags) + const struct inode *inode) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode, flags); + return dentry->d_op->d_real(dentry, inode); else return dentry; } @@ -595,7 +592,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); + return d_backing_inode(d_real((struct dentry *) dentry, NULL)); } struct name_snapshot { diff --git a/include/linux/fs.h b/include/linux/fs.h index 1cbcf37c45e1..1fa63d184d1f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1239,7 +1239,7 @@ static inline struct inode *file_inode(const struct file *f) static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file), 0); + return d_real(file->f_path.dentry, file_inode(file)); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -- cgit v1.2.3 From 51261aac51a05c791ef880a100ac2ceed201ef72 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:46:55 +0800 Subject: iommu/vt-d: Avoid using idr_for_each_entry() idr_for_each_entry() is used to iteratte over idr elements of a given type. It isn't suitable for the globle pasid idr since the pasid idr consumer could specify different types of pointers to bind with a pasid. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Liu Yi L Reviewed-by: Peter Xu Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 6692b40ca814..e1e193855581 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -487,6 +487,7 @@ struct intel_svm { int flags; int pasid; struct list_head devs; + struct list_head list; }; extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev); -- cgit v1.2.3 From af39507305fb83a5d3c475c2851f4d59545d8a18 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:46:56 +0800 Subject: iommu/vt-d: Apply global PASID in SVA This patch applies the global pasid name space in the shared virtual address (SVA) implementation. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Liu Yi L Reviewed-by: Peter Xu Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e1e193855581..2ff15195f73d 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -420,7 +420,6 @@ struct intel_iommu { struct pasid_state_entry *pasid_state_table; struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ - struct idr pasid_idr; u32 pasid_max; #endif struct q_inval *qi; /* Queued invalidation info */ -- cgit v1.2.3 From 9ddbfb42138d84bb326023616c40a3dc30ea2837 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:46:57 +0800 Subject: iommu/vt-d: Move device_domain_info to header This allows the per device iommu data and some helpers to be used in other files. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Liu Yi L Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2ff15195f73d..2e1fbde020ca 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -387,6 +388,42 @@ struct pasid_entry; struct pasid_state_entry; struct page_req_dsc; +struct dmar_domain { + int nid; /* node id */ + + unsigned iommu_refcnt[DMAR_UNITS_SUPPORTED]; + /* Refcount of devices per iommu */ + + + u16 iommu_did[DMAR_UNITS_SUPPORTED]; + /* Domain ids per IOMMU. Use u16 since + * domain ids are 16 bit wide according + * to VT-d spec, section 9.3 */ + + bool has_iotlb_device; + struct list_head devices; /* all devices' list */ + struct iova_domain iovad; /* iova's that belong to this domain */ + + struct dma_pte *pgd; /* virtual address */ + int gaw; /* max guest address width */ + + /* adjusted guest address width, 0 is level 2 30-bit */ + int agaw; + + int flags; /* flags to find out type of domain */ + + int iommu_coherency;/* indicate coherency of iommu access */ + int iommu_snooping; /* indicate snooping control feature*/ + int iommu_count; /* reference count of iommu */ + int iommu_superpage;/* Level of superpages supported: + 0 == 4KiB (no superpages), 1 == 2MiB, + 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ + u64 max_addr; /* maximum mapped address */ + + struct iommu_domain domain; /* generic domain data structure for + iommu core */ +}; + struct intel_iommu { void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u64 reg_phys; /* physical address of hw register set */ @@ -435,6 +472,25 @@ struct intel_iommu { u32 flags; /* Software defined flags */ }; +/* PCI domain-device relationship */ +struct device_domain_info { + struct list_head link; /* link to domain siblings */ + struct list_head global; /* link to global list */ + u8 bus; /* PCI bus number */ + u8 devfn; /* PCI devfn number */ + u16 pfsid; /* SRIOV physical function source ID */ + u8 pasid_supported:3; + u8 pasid_enabled:1; + u8 pri_supported:1; + u8 pri_enabled:1; + u8 ats_supported:1; + u8 ats_enabled:1; + u8 ats_qdep; + struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ + struct dmar_domain *domain; /* pointer to domain */ +}; + static inline void __iommu_flush_cache( struct intel_iommu *iommu, void *addr, int size) { @@ -460,6 +516,11 @@ extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); +struct dmar_domain *get_valid_domain_for_dev(struct device *dev); +void *alloc_pgtable_page(int node); +void free_pgtable_page(void *vaddr); +struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); + #ifdef CONFIG_INTEL_IOMMU_SVM extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu); extern int intel_svm_free_pasid_tables(struct intel_iommu *iommu); -- cgit v1.2.3 From 85319dcc8955f8f31828dc8bafff29f6aa011d93 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:46:58 +0800 Subject: iommu/vt-d: Add for_each_device_domain() helper This adds a helper named for_each_device_domain() to iterate over the elements in device_domain_list and invoke a callback against each element. This allows to search the device_domain list in other source files. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Liu Yi L Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 2e1fbde020ca..4fd4c6fee93e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -520,6 +520,8 @@ struct dmar_domain *get_valid_domain_for_dev(struct device *dev); void *alloc_pgtable_page(int node); void free_pgtable_page(void *vaddr); struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); +int for_each_device_domain(int (*fn)(struct device_domain_info *info, + void *data), void *data); #ifdef CONFIG_INTEL_IOMMU_SVM extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu); -- cgit v1.2.3 From cc580e41260dbf1a46269235f1f2b572137d9d03 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:46:59 +0800 Subject: iommu/vt-d: Per PCI device pasid table interfaces This patch adds the interfaces for per PCI device pasid table management. Currently we allocate one pasid table for all PCI devices under the scope of an IOMMU. It's insecure in some cases where multiple devices under one single IOMMU unit support PASID features. With per PCI device pasid table, we can achieve finer protection and isolation granularity. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Suggested-by: Ashok Raj Signed-off-by: Lu Baolu Reviewed-by: Liu Yi L Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4fd4c6fee93e..e7901d402337 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -476,6 +476,7 @@ struct intel_iommu { struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ + struct list_head table; /* link to pasid table */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ @@ -489,6 +490,7 @@ struct device_domain_info { struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ + struct pasid_table *pasid_table; /* pasid table */ }; static inline void __iommu_flush_cache( -- cgit v1.2.3 From d9737953d85131436b09668b5e8d3389c37c1f28 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 14 Jul 2018 15:47:02 +0800 Subject: iommu/vt-d: Remove the obsolete per iommu pasid tables The obsolete per iommu pasid tables are no longer used. Hence, clean up them. Cc: Ashok Raj Cc: Jacob Pan Cc: Kevin Tian Cc: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Liu Yi L Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e7901d402337..3c43882d3b77 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -453,7 +453,6 @@ struct intel_iommu { * devices away to userspace processes (e.g. for DPDK) and don't * want to trust that userspace will use *only* the PASID it was * told to. But while it's all driver-arbitrated, we're fine. */ - struct pasid_entry *pasid_table; struct pasid_state_entry *pasid_state_table; struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ @@ -526,8 +525,8 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void *data), void *data); #ifdef CONFIG_INTEL_IOMMU_SVM -extern int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu); -extern int intel_svm_free_pasid_tables(struct intel_iommu *iommu); +int intel_svm_init(struct intel_iommu *iommu); +int intel_svm_exit(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); -- cgit v1.2.3 From 226ab561075f6f8f3cd5f7b3b7544f3997aab51f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jul 2018 21:49:34 -0700 Subject: device-dax: Convert to vmf_insert_mixed and vm_fault_t Use new return type vm_fault_t for fault and huge_fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Commit 1c8f422059ae ("mm: change return type to vm_fault_t") Previously vm_insert_mixed() returned an error code which driver mapped into VM_FAULT_* type. The new function vmf_insert_mixed() will replace this inefficiency by returning VM_FAULT_* type. Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- include/linux/huge_mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8a126259bc4..d3bbf6bea9e9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -3,6 +3,7 @@ #define _LINUX_HUGE_MM_H #include +#include #include /* only for vma_is_dax() */ @@ -46,9 +47,9 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, bool write); -int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, -- cgit v1.2.3 From a2dca217dae29c4ff6420e8c78d56b3f61ae0797 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 16 Jul 2018 15:06:18 +0200 Subject: KVM: arm/arm64: vgic: Define GICD_IIDR fields for GICv2 and GIv3 Instead of hardcoding the shifts and masks in the GICD_IIDR register emulation, let's add the definition of these fields to the GIC header files and use them. This will make things more obvious when we're going to bump the revision in the IIDR when we'll make guest-visible changes to the implementation. Reviewed-by: Andrew Jones Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 10 ++++++++++ include/linux/irqchip/arm-gic.h | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index cbb872c1b607..b22f9dfa61af 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -61,6 +61,16 @@ #define GICD_CTLR_ENABLE_G1A (1U << 1) #define GICD_CTLR_ENABLE_G1 (1U << 0) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + /* * In systems with a single security state (what we emulate in KVM) * the meaning of the interrupt group enable bits is slightly different diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 68d8b1f73682..484f5bfa9f3d 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -71,6 +71,16 @@ (GICD_INT_DEF_PRI << 8) |\ GICD_INT_DEF_PRI) +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + #define GICH_HCR 0x0 #define GICH_VTR 0x4 #define GICH_VMCR 0x8 -- cgit v1.2.3 From 87322099052b6a534cecd3d303fc097d4560b7d0 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 16 Jul 2018 15:06:22 +0200 Subject: KVM: arm/arm64: vgic: Signal IRQs using their configured group Now when we have a group configuration on the struct IRQ, use this state when populating the LR and signaling interrupts as either group 0 or group 1 to the VM. Depending on the model of the emulated GIC, and the guest's configuration of the VMCR, interrupts may be signaled as IRQs or FIQs to the VM. Reviewed-by: Andrew Jones Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 484f5bfa9f3d..6c4aaf04046c 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -104,6 +104,7 @@ #define GICH_LR_PENDING_BIT (1 << 28) #define GICH_LR_ACTIVE_BIT (1 << 29) #define GICH_LR_EOI (1 << 19) +#define GICH_LR_GROUP1 (1 << 30) #define GICH_LR_HW (1 << 31) #define GICH_VMCR_ENABLE_GRP0_SHIFT 0 -- cgit v1.2.3 From 1fb53567a3633740aac8761eb7023dc5671f0edb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 May 2017 13:45:14 -0500 Subject: pids: Move task_pid_type into sched/signal.h The function is general and inline so there is no need to hide it inside of exit.c Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..d8ef0a3d2e7e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -556,6 +556,14 @@ extern bool current_is_single_threaded(void); typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; -- cgit v1.2.3 From 7a36094d61bfe9843de5484ff0140227983ac5d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 12:45:33 -0500 Subject: pids: Compute task_tgid using signal->leader_pid The cost is the the same and this removes the need to worry about complications that come from de_thread and group_leader changing. __task_pid_nr_ns has been updated to take advantage of this change. Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 5 ----- include/linux/sched/signal.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 87bf02d93a27..a461ff89a3af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,11 +1202,6 @@ static inline struct pid *task_pid(struct task_struct *task) return task->pids[PIDTYPE_PID].pid; } -static inline struct pid *task_tgid(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PID].pid; -} - /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d8ef0a3d2e7e..b95a272c1ab5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -564,6 +564,11 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) return task->pids[type].pid; } +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->signal->leader_pid; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; -- cgit v1.2.3 From 2c4704756cab7cfa031ada4dab361562f0e357c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Sep 2017 13:06:43 -0500 Subject: pids: Move the pgrp and session pid pointers from task_struct to signal_struct To access these fields the code always has to go to group leader so going to signal struct is no loss and is actually a fundamental simplification. This saves a little bit of memory by only allocating the pid pointer array once instead of once for every thread, and even better this removes a few potential races caused by the fact that group_leader can be changed by de_thread, while signal_struct can not. Signed-off-by: "Eric W. Biederman" --- include/linux/init_task.h | 9 --------- include/linux/pid.h | 8 +------- include/linux/sched.h | 22 ++++------------------ include/linux/sched/signal.h | 26 +++++++++++++++++++++++--- 4 files changed, 28 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a454b8aeb938..a7083a45a26c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -46,15 +46,6 @@ extern struct cred init_cred; #define INIT_CPU_TIMERS(s) #endif -#define INIT_PID_LINK(type) \ -{ \ - .node = { \ - .next = NULL, \ - .pprev = NULL, \ - }, \ - .pid = &init_struct_pid, \ -} - #define INIT_TASK_COMM "swapper" /* Attach to the init_task data structure for proper alignment */ diff --git a/include/linux/pid.h b/include/linux/pid.h index 7633d55d9a24..3d4c504dcc8c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -67,12 +67,6 @@ struct pid extern struct pid init_struct_pid; -struct pid_link -{ - struct hlist_node node; - struct pid *pid; -}; - static inline struct pid *get_pid(struct pid *pid) { if (pid) @@ -177,7 +171,7 @@ pid_t pid_vnr(struct pid *pid); do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ - &(pid)->tasks[type], pids[type].node) { + &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to diff --git a/include/linux/sched.h b/include/linux/sched.h index a461ff89a3af..445bdf5b1f64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,7 +775,8 @@ struct task_struct { struct list_head ptrace_entry; /* PID/PID hash table linkage. */ - struct pid_link pids[PIDTYPE_MAX]; + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; @@ -1199,22 +1200,7 @@ struct task_struct { static inline struct pid *task_pid(struct task_struct *task) { - return task->pids[PIDTYPE_PID].pid; -} - -/* - * Without tasklist or RCU lock it is not safe to dereference - * the result of task_pgrp/task_session even if task == current, - * we can race with another thread doing sys_setsid/sys_setpgid. - */ -static inline struct pid *task_pgrp(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PGID].pid; -} - -static inline struct pid *task_session(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_SID].pid; + return task->thread_pid; } /* @@ -1263,7 +1249,7 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) */ static inline int pid_alive(const struct task_struct *p) { - return p->pids[PIDTYPE_PID].pid != NULL; + return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b95a272c1ab5..2dcded16eb1e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -146,7 +146,9 @@ struct signal_struct { #endif + /* PID/PID hash table linkage. */ struct pid *leader_pid; + struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; @@ -559,9 +561,12 @@ void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else + pid = task->signal->pids[type]; + return pid; } static inline struct pid *task_tgid(struct task_struct *task) @@ -569,6 +574,21 @@ static inline struct pid *task_tgid(struct task_struct *task) return task->signal->leader_pid; } +/* + * Without tasklist or RCU lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_PGID]; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_SID]; +} + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; -- cgit v1.2.3 From 6883f81aac6f44e7df70a6af189b3689ff52cbfb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 4 Jun 2017 04:32:13 -0500 Subject: pid: Implement PIDTYPE_TGID Everywhere except in the pid array we distinguish between a tasks pid and a tasks tgid (thread group id). Even in the enumeration we want that distinction sometimes so we have added __PIDTYPE_TGID. With leader_pid we almost have an implementation of PIDTYPE_TGID in struct signal_struct. Add PIDTYPE_TGID as a first class member of the pid_type enumeration and into the pids array. Then remove the __PIDTYPE_TGID special case and the leader_pid in signal_struct. The net size increase is just an extra pointer added to struct pid and an extra pair of pointers of an hlist_node added to task_struct. The effect on code maintenance is the removal of a number of special cases today and the potential to remove many more special cases as PIDTYPE_TGID gets used to it's fullest. The long term potential is allowing zombie thread group leaders to exit, which will remove a lot more special cases in the code. Signed-off-by: "Eric W. Biederman" --- include/linux/pid.h | 3 +-- include/linux/sched.h | 4 ++-- include/linux/sched/signal.h | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 3d4c504dcc8c..14a9a39da9c7 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -7,11 +7,10 @@ enum pid_type { PIDTYPE_PID, + PIDTYPE_TGID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX, - /* only valid to __task_pid_nr_ns() */ - __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 445bdf5b1f64..06b4e3bda93a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1275,12 +1275,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { - return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 2dcded16eb1e..ee30a5ba475f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -147,7 +147,6 @@ struct signal_struct { #endif /* PID/PID hash table linkage. */ - struct pid *leader_pid; struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL @@ -571,7 +570,7 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type) static inline struct pid *task_tgid(struct task_struct *task) { - return task->signal->leader_pid; + return task->signal->pids[PIDTYPE_TGID]; } /* @@ -607,7 +606,7 @@ static inline bool thread_group_leader(struct task_struct *p) */ static inline bool has_group_leader_pid(struct task_struct *p) { - return task_pid(p) == p->signal->leader_pid; + return task_pid(p) == task_tgid(p); } static inline -- cgit v1.2.3 From 24122c7f4969adeeaeca3fb1656a31569e9aa59b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 14:30:23 -0500 Subject: signal: Pass pid and pid type into send_sigqueue Make the code more maintainable by performing more of the signal related work in send_sigqueue. A quick inspection of do_timer_create will show that this code path does not lookup a thread group by a thread's pid. Making it safe to find the task pointed to by it_pid with "pid_task(it_pid, type)"; This supports the changes needed in fork to tell if a signal was sent to a single process or a group of processes. Having the pid to task transition in signal.c will also make it easier to sort out races with de_thread and and the thread group leader exiting when it comes time to address that. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ee30a5ba475f..94558ffa82ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -330,7 +330,7 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline int restart_syscall(void) -- cgit v1.2.3 From 0102498083d58d8b17759642c602b525215e1a54 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jul 2018 18:40:57 -0500 Subject: signal: Pass pid type into group_send_sig_info This passes the information we already have at the call sight into group_send_sig_info. Ultimatelly allowing for to better handle signals sent to a group of processes. Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..d8f2bf3d41e6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,11 +254,13 @@ static inline int valid_signal(unsigned long sig) struct timespec; struct pt_regs; +enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); -extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); +extern int group_send_sig_info(int sig, struct siginfo *info, + struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); -- cgit v1.2.3 From 40b3b02535621027f56d248139e0e467573c3098 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 10:45:15 -0500 Subject: signal: Pass pid type into do_send_sig_info This passes the information we already have at the call sight into do_send_sig_info. Ultimately allowing for better handling of signals sent to a group of processes during fork. Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index d8f2bf3d41e6..fe125b0335f7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -258,7 +258,7 @@ enum pid_type; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, - struct task_struct *p, bool group); + struct task_struct *p, enum pid_type type); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); -- cgit v1.2.3 From c2a7d2a115525d3501d38e23d24875a79a07e15e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jul 2018 21:50:16 -0700 Subject: filesystem-dax: Introduce dax_lock_mapping_entry() In preparation for implementing support for memory poison (media error) handling via dax mappings, implement a lock_page() equivalent. Poison error handling requires rmap and needs guarantees that the page->mapping association is maintained / valid (inode not freed) for the duration of the lookup. In the device-dax case it is sufficient to simply hold a dev_pagemap reference. In the filesystem-dax case we need to use the entry lock. Export the entry lock via dax_lock_mapping_entry() that uses rcu_read_lock() to protect against the inode being freed, and revalidates the page->mapping association under xa_lock(). Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- include/linux/dax.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index deb0f663252f..450b28db9533 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -88,6 +88,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); +bool dax_lock_mapping_entry(struct page *page); +void dax_unlock_mapping_entry(struct page *page); #else static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) @@ -119,6 +121,17 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping, { return -EOPNOTSUPP; } + +static inline bool dax_lock_mapping_entry(struct page *page) +{ + if (IS_DAX(page->mapping->host)) + return true; + return false; +} + +static inline void dax_unlock_mapping_entry(struct page *page) +{ +} #endif int dax_read_lock(void); -- cgit v1.2.3 From 6100e34b2526e1dc3dbcc47fea2677974d6aaea5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jul 2018 21:50:21 -0700 Subject: mm, memory_failure: Teach memory_failure() about dev_pagemap pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mce: Uncorrected hardware memory error in user-access at af34214200 {1}[Hardware Error]: It has been corrected by h/w and requires no further action mce: [Hardware Error]: Machine check events logged {1}[Hardware Error]: event severity: corrected Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users [..] Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed mce: Memory error not recovered In contrast to typical memory, dev_pagemap pages may be dax mapped. With dax there is no possibility to map in another page dynamically since dax establishes 1:1 physical address to file offset associations. Also dev_pagemap pages associated with NVDIMM / persistent memory devices can internal remap/repair addresses with poison. While memory_failure() assumes that it can discard typical poisoned pages and keep them unmapped indefinitely, dev_pagemap pages may be returned to service after the error is cleared. Teach memory_failure() to detect and handle MEMORY_DEVICE_HOST dev_pagemap pages that have poison consumed by userspace. Mark the memory as UC instead of unmapping it completely to allow ongoing access via the device driver (nd_pmem). Later, nd_pmem will grow support for marking the page back to WB when the error is cleared. Cc: Jan Kara Cc: Christoph Hellwig Cc: Jérôme Glisse Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Ross Zwisler Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a0fbb9ffe380..374e5e9284f7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2725,6 +2725,7 @@ enum mf_action_page_type { MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, + MF_MSG_DAX, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 10ac86884b4d7642a235f9da61367c3f8d2ab2ff Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Tue, 24 Jul 2018 19:11:26 +0200 Subject: fbcon: introduce for_each_registered_fb() helper Following pattern is often used: for (i = 0; i < FB_MAX; i++) { if (registered_fb[i]) { ... } } Therefore, as Andy's suggestion, for_each_registered_fb() helper can be introduced to make the code easier to read and write by reducing indentation level. It also saves few lines of code in each occurrence. This patch convert all part here at the same time. Suggested-by: Andy Shevchenko Signed-off-by: Yisheng Xie Acked-by: Hans de Goede Reviewed-by: Andy Shevchenko Cc: Kees Cook Cc: David Lechner Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/fb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index aa74a228bb92..fd31e6f24b8d 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -650,6 +650,10 @@ extern struct fb_info *registered_fb[FB_MAX]; extern int num_registered_fb; extern struct class *fb_class; +#define for_each_registered_fb(i) \ + for (i = 0; i < FB_MAX; i++) \ + if (!registered_fb[i]) {} else + extern int lock_fb_info(struct fb_info *info); static inline void unlock_fb_info(struct fb_info *info) -- cgit v1.2.3 From 3d910ef71732d15f251ca9b679da634adca72f5a Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Tue, 24 Jul 2018 19:11:26 +0200 Subject: fbdev: fix typo in comment Change beeng to being and occured to occurred. Signed-off-by: Yisheng Xie Acked-by: Hans de Goede Cc: Kees Cook Cc: David Lechner Cc: Andy Shevchenko Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/fb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index fd31e6f24b8d..3e7e75383d32 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -126,7 +126,7 @@ struct fb_cursor_user { /* The resolution of the passed in fb_info about to change */ #define FB_EVENT_MODE_CHANGE 0x01 -/* The display on this fb_info is beeing suspended, no access to the +/* The display on this fb_info is being suspended, no access to the * framebuffer is allowed any more after that call returns */ #define FB_EVENT_SUSPEND 0x02 @@ -159,9 +159,9 @@ struct fb_cursor_user { #define FB_EVENT_FB_UNBIND 0x0E /* CONSOLE-SPECIFIC: remap all consoles to new fb - for vga_switcheroo */ #define FB_EVENT_REMAP_ALL_CONSOLE 0x0F -/* A hardware display blank early change occured */ +/* A hardware display blank early change occurred */ #define FB_EARLY_EVENT_BLANK 0x10 -/* A hardware display blank revert early change occured */ +/* A hardware display blank revert early change occurred */ #define FB_R_EARLY_EVENT_BLANK 0x11 struct fb_event { -- cgit v1.2.3 From 1560d0848a1a84db6c1d9b17c14273c0dae41828 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 25 Jul 2018 14:34:29 +0200 Subject: rtc: remove rtc_irq_register/rtc_irq_unregister The rtc_irq_* interface is not used from outside the RTC subsytem since 2016. Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 6268208760e9..f868d6b619ab 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -204,10 +204,6 @@ extern void rtc_update_irq(struct rtc_device *rtc, extern struct rtc_device *rtc_class_open(const char *name); extern void rtc_class_close(struct rtc_device *rtc); -extern int rtc_irq_register(struct rtc_device *rtc, - struct rtc_task *task); -extern void rtc_irq_unregister(struct rtc_device *rtc, - struct rtc_task *task); extern int rtc_irq_set_state(struct rtc_device *rtc, struct rtc_task *task, int enabled); extern int rtc_irq_set_freq(struct rtc_device *rtc, -- cgit v1.2.3 From acecb3ad8b21a519ce4ad728106d45d4e978bb56 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 25 Jul 2018 14:58:10 +0200 Subject: rtc: remove irq_task and irq_task_lock There is no way to set a periodic task anymore, remove task pointer and lock. Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index f868d6b619ab..8cc23fdfd4ee 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -121,8 +121,6 @@ struct rtc_device { wait_queue_head_t irq_queue; struct fasync_struct *async_queue; - struct rtc_task *irq_task; - spinlock_t irq_task_lock; int irq_freq; int max_user_freq; -- cgit v1.2.3 From 8719d3c9188b38db462a77ecd8c7a8e25e7e8c4c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 25 Jul 2018 15:07:09 +0200 Subject: rtc: simplify rtc_irq_set_state/rtc_irq_set_freq The PIE doesn't handle tasks anymore, remove the pointer from the interface. Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 8cc23fdfd4ee..bf4d375025d1 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -202,10 +202,8 @@ extern void rtc_update_irq(struct rtc_device *rtc, extern struct rtc_device *rtc_class_open(const char *name); extern void rtc_class_close(struct rtc_device *rtc); -extern int rtc_irq_set_state(struct rtc_device *rtc, - struct rtc_task *task, int enabled); -extern int rtc_irq_set_freq(struct rtc_device *rtc, - struct rtc_task *task, int freq); +extern int rtc_irq_set_state(struct rtc_device *rtc, int enabled); +extern int rtc_irq_set_freq(struct rtc_device *rtc, int freq); extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, -- cgit v1.2.3 From 1f45a4db3657c5ecafb7d3331536c987e6d18311 Mon Sep 17 00:00:00 2001 From: Paul McKenney Date: Thu, 28 Jun 2018 11:21:43 -0700 Subject: srcu: Add notrace variants of srcu_read_{lock,unlock} This is needed for a future tracepoint patch that uses srcu, and to make sure it doesn't call into lockdep. tracepoint code already calls notrace variants for rcu_read_lock_sched so this patch does the same for srcu which will be used in a later patch. Keeps it consistent with rcu-sched. [Joel: Added commit message] Link: http://lkml.kernel.org/r/20180628182149.226164-2-joel@joelfernandes.org Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/srcu.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 91494d7e8e41..3e72a291c401 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -195,6 +195,16 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) return retval; } +/* Used by tracing, cannot be traced and cannot invoke lockdep. */ +static inline notrace int +srcu_read_lock_notrace(struct srcu_struct *sp) __acquires(sp) +{ + int retval; + + retval = __srcu_read_lock(sp); + return retval; +} + /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. * @sp: srcu_struct in which to unregister the old reader. @@ -209,6 +219,13 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/* Used by tracing, cannot be traced and cannot call lockdep. */ +static inline notrace void +srcu_read_unlock_notrace(struct srcu_struct *sp, int idx) __releases(sp) +{ + __srcu_read_unlock(sp, idx); +} + /** * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock * -- cgit v1.2.3 From 0b764a6e4e19dc254caca636002eaa47b1d0b0af Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 28 Jun 2018 11:21:44 -0700 Subject: srcu: Add notrace variant of srcu_dereference In the last patch in this series, we are making lockdep register hooks onto the irq_{disable,enable} tracepoints. These tracepoints use the _rcuidle tracepoint variant. In this series we switch the _rcuidle tracepoint callers to use SRCU instead of sched-RCU. Inorder to dereference the pointer to the probe functions, we could call srcu_dereference, however this API will call back into lockdep to check if the lock is held *before* the lockdep probe hooks have a chance to run and annotate the IRQ enabled/disabled state. For this reason we need a notrace variant of srcu_dereference since otherwise we get lockdep splats. This patch adds the needed srcu_dereference_notrace variant. Link: http://lkml.kernel.org/r/20180628182149.226164-3-joel@joelfernandes.org Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/srcu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 3e72a291c401..67135d4a8a30 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -169,6 +169,11 @@ static inline int srcu_read_lock_held(const struct srcu_struct *sp) */ #define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0) +/** + * srcu_dereference_notrace - no tracing and no lockdep calls from here + */ +#define srcu_dereference_notrace(p, sp) srcu_dereference_check((p), (sp), 1) + /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @sp: srcu_struct in which to register the new reader. -- cgit v1.2.3 From 72809cbf6748830ae4a59a45bcb2367a6c24d74d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 26 Jul 2018 21:44:32 +0900 Subject: tracing: Remove orphaned function ftrace_nr_registered_ops() Remove ftrace_nr_registered_ops() because it is no longer used. ftrace_nr_registered_ops() has been introduced by commit ea701f11da44 ("ftrace: Add selftest to test function trace recursion protection"), but its caller has been removed by commit 05cbbf643b8e ("tracing: Fix selftest function recursion accounting"). So it is not called anymore. Link: http://lkml.kernel.org/r/153260907227.12474.5234899025934963683.stgit@devbox Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ebb77674be90..63af5eb0ff46 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -234,10 +234,6 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, */ #define register_ftrace_function(ops) ({ 0; }) #define unregister_ftrace_function(ops) ({ 0; }) -static inline int ftrace_nr_registered_ops(void) -{ - return 0; -} static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } @@ -328,8 +324,6 @@ struct seq_file; extern int ftrace_text_reserved(const void *start, const void *end); -extern int ftrace_nr_registered_ops(void); - struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); bool is_ftrace_trampoline(unsigned long addr); -- cgit v1.2.3 From af9b6d7570ca9afbbc6076ab7920d8f00f7e55c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Jun 2018 12:45:53 -0400 Subject: pNFS: Parse the results of layoutget on open even if permissions checks fail Even if the results of the permissions checks failed, we should parse the results of the layout on open call so that we can return the layout if required. Note that we also want to ignore the sequence counter for whether or not a layout recall occurred. If the recall pertained to our OPEN, then the callback will know, and will attempt to wait for us to finih processing anyway. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 - include/linux/nfs_xdr.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 74ae3e1d19a0..2c18d618604e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -28,7 +28,6 @@ struct nfs41_impl_id; struct nfs_client { refcount_t cl_count; atomic_t cl_mds_count; - seqcount_t cl_callback_count; int cl_cons_state; /* current construction state (-ve: init error) */ #define NFS_CS_READY 0 /* ready to be used */ #define NFS_CS_INITING 1 /* busy initialising */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 712eed156d09..3b7325cfb291 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -271,7 +271,6 @@ struct nfs4_layoutget { struct nfs4_layoutget_args args; struct nfs4_layoutget_res res; struct rpc_cred *cred; - unsigned callback_count; gfp_t gfp_flags; }; -- cgit v1.2.3 From e15d54d5009688ccb2a5312f3b70d631615329c9 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 27 Jun 2018 14:46:21 +0800 Subject: f2fs: Allocate and stat mem used by free nid bitmap more accurately This patch used f2fs_bitmap_size macro to calculate mem used by free nid bitmap, and stat used mem including aligned part. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index aa5db8b5521a..f70f8ac9c4f4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -304,11 +304,6 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) -#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) -#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ - ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ - BITS_PER_LONG * BITS_PER_LONG) - struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ -- cgit v1.2.3 From a5ec5a7bfd1f28d1905499641c9f589be36808c1 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 27 Jul 2018 13:47:02 -0700 Subject: ata: ahci: Support state with min power but Partial low power state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently when min_power policy is selected, the partial low power state is not entered and link will try aggressively enter to only slumber state. Add a new policy which still enable DEVSLP but also try to enter partial low power state. This policy is presented as "min_power_with_partial". For information the difference between partial and slumber Partial – PHY logic is powered up, and in a reduced power state. The link PM exit latency to active state maximum is 10 ns. Slumber – PHY logic is powered up, and in a reduced power state. The link PM exit latency to active state maximum is 10 ms. Devslp – PHY logic is powered down. The link PM exit latency from this state to active state maximum is 20 ms, unless otherwise specified by DETO. Suggested-and-reviewed-by: Hans de Goede Signed-off-by: Srinivas Pandruvada Signed-off-by: Tejun Heo --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index aa8583655a18..80c2bd202367 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -522,7 +522,8 @@ enum ata_lpm_policy { ATA_LPM_MAX_POWER, ATA_LPM_MED_POWER, ATA_LPM_MED_POWER_WITH_DIPM, /* Med power + DIPM as win IRST does */ - ATA_LPM_MIN_POWER, + ATA_LPM_MIN_POWER_WITH_PARTIAL, /* Min Power + partial and slumber */ + ATA_LPM_MIN_POWER, /* Min power + no partial (slumber only) */ }; enum ata_lpm_hints { -- cgit v1.2.3 From 82b98ca566ca2af170eb0ab50cef09dd7335fa55 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Thu, 5 Jul 2018 16:48:50 +0000 Subject: net/sunrpc: Make rpc_auth_create_args a const This turns rpc_auth_create_args into a const as it gets passed through the auth stack. Signed-off-by: Sargun Dhillon Signed-off-by: Anna Schumaker --- include/linux/sunrpc/auth.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index d9af474a857d..58a6765c1c5e 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -125,7 +125,8 @@ struct rpc_authops { struct module *owner; rpc_authflavor_t au_flavor; /* flavor (RPC_AUTH_*) */ char * au_name; - struct rpc_auth * (*create)(struct rpc_auth_create_args *, struct rpc_clnt *); + struct rpc_auth * (*create)(const struct rpc_auth_create_args *, + struct rpc_clnt *); void (*destroy)(struct rpc_auth *); int (*hash_cred)(struct auth_cred *, unsigned int); @@ -174,7 +175,7 @@ struct rpc_cred * rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t); struct rpc_cred * rpc_lookup_machine_cred(const char *service_name); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); -struct rpc_auth * rpcauth_create(struct rpc_auth_create_args *, +struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *, struct rpc_clnt *); void rpcauth_release(struct rpc_auth *); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, -- cgit v1.2.3 From e6753f23d961d601dbae50a2fc2a3975c9715b14 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Jul 2018 15:24:22 -0700 Subject: tracepoint: Make rcuidle tracepoint callers use SRCU In recent tests with IRQ on/off tracepoints, a large performance overhead ~10% is noticed when running hackbench. This is root caused to calls to rcu_irq_enter_irqson and rcu_irq_exit_irqson from the tracepoint code. Following a long discussion on the list [1] about this, we concluded that srcu is a better alternative for use during rcu idle. Although it does involve extra barriers, its lighter than the sched-rcu version which has to do additional RCU calls to notify RCU idle about entry into RCU sections. In this patch, we change the underlying implementation of the trace_*_rcuidle API to use SRCU. This has shown to improve performance alot for the high frequency irq enable/disable tracepoints. Test: Tested idle and preempt/irq tracepoints. Here are some performance numbers: With a run of the following 30 times on a single core x86 Qemu instance with 1GB memory: hackbench -g 4 -f 2 -l 3000 Completion times in seconds. CONFIG_PROVE_LOCKING=y. No patches (without this series) Mean: 3.048 Median: 3.025 Std Dev: 0.064 With Lockdep using irq tracepoints with RCU implementation: Mean: 3.451 (-11.66 %) Median: 3.447 (-12.22%) Std Dev: 0.049 With Lockdep using irq tracepoints with SRCU implementation (this series): Mean: 3.020 (I would consider the improvement against the "without this series" case as just noise). Median: 3.013 Std Dev: 0.033 [1] https://patchwork.kernel.org/patch/10344297/ [remove rcu_read_lock_sched_notrace as its the equivalent of preempt_disable_notrace and is unnecessary to call in tracepoint code] Link: http://lkml.kernel.org/r/20180730222423.196630-3-joel@joelfernandes.org Cleaned-up-by: Peter Zijlstra Acked-by: Peter Zijlstra Reviewed-by: Mathieu Desnoyers Signed-off-by: Joel Fernandes (Google) [ Simplified WARN_ON_ONCE() ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 19a690b559ca..d9a084c72541 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -33,6 +34,8 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 +extern struct srcu_struct tracepoint_srcu; + extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -75,10 +78,16 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. */ +#ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { + synchronize_srcu(&tracepoint_srcu); synchronize_sched(); } +#else +static inline void tracepoint_synchronize_unregister(void) +{ } +#endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS extern int syscall_regfunc(void); @@ -129,18 +138,31 @@ extern void syscall_unregfunc(void); * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond, rcucheck) \ +#define __DO_TRACE(tp, proto, args, cond, rcuidle) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ void *__data; \ + int __maybe_unused idx = 0; \ \ if (!(cond)) \ return; \ - if (rcucheck) \ - rcu_irq_enter_irqson(); \ - rcu_read_lock_sched_notrace(); \ - it_func_ptr = rcu_dereference_sched((tp)->funcs); \ + \ + /* srcu can't be used from NMI */ \ + WARN_ON_ONCE(rcuidle && in_nmi()); \ + \ + /* keep srcu and sched-rcu usage consistent */ \ + preempt_disable_notrace(); \ + \ + /* \ + * For rcuidle callers, use srcu since sched-rcu \ + * doesn't work from the idle path. \ + */ \ + if (rcuidle) \ + idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + \ + it_func_ptr = rcu_dereference_raw((tp)->funcs); \ + \ if (it_func_ptr) { \ do { \ it_func = (it_func_ptr)->func; \ @@ -148,9 +170,11 @@ extern void syscall_unregfunc(void); ((void(*)(proto))(it_func))(args); \ } while ((++it_func_ptr)->func); \ } \ - rcu_read_unlock_sched_notrace(); \ - if (rcucheck) \ - rcu_irq_exit_irqson(); \ + \ + if (rcuidle) \ + srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + \ + preempt_enable_notrace(); \ } while (0) #ifndef MODULE -- cgit v1.2.3 From 56e6c104e4f151e19eb410004405ec52b4f8605a Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 31 Jul 2018 13:06:57 +0200 Subject: console: Replace #if 0 with atomic var 'ignore_console_lock_warning' The macro WARN_CONSOLE_UNLOCKED prints a warning when a thread enters the console's critical section without having acquired the console lock. The console lock can be ignored when debugging the console using printk, but this makes WARN_CONSOLE_UNLOCKED generate unnecessary warnings. The variable ignore_console_lock_warning temporarily disables WARN_CONSOLE_UNLOCKED. Developers interested in debugging the console's critical sections should increment it before entering the CS and decrement it after leaving the CS. Setting ignore_console_lock_warning is only for debugging. Regular operation should not manipulate it. Acknoledgements: This patch is based on an earlier version by Steven Rostedt. The use of atomic increment/decrement was suggested by Petr Mladek. Link: http://lkml.kernel.org/r/717e6337-e7a6-7a92-1c1b-8929a25696b5@suse.de Signed-off-by: Thomas Zimmermann Acked-by: Hans de Goede Acked-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: Andrew Morton [b.zolnierkie: use ] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/console.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f59f3dbca65c..ec9bdb3d7bab 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -14,6 +14,7 @@ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 +#include #include struct vc_data; @@ -201,11 +202,14 @@ void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ -#if 1 -#define WARN_CONSOLE_UNLOCKED() WARN_ON(!is_console_locked() && !oops_in_progress) -#else -#define WARN_CONSOLE_UNLOCKED() -#endif +#define WARN_CONSOLE_UNLOCKED() \ + WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ + !is_console_locked() && !oops_in_progress) +/* + * Increment ignore_console_lock_warning if you need to quiet + * WARN_CONSOLE_UNLOCKED() for debugging purposes. + */ +extern atomic_t ignore_console_lock_warning; /* VESA Blanking Levels */ #define VESA_NO_BLANKING 0 -- cgit v1.2.3 From c3bc8fd637a9623f5c507bd18f9677effbddf584 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 30 Jul 2018 15:24:23 -0700 Subject: tracing: Centralize preemptirq tracepoints and unify their usage This patch detaches the preemptirq tracepoints from the tracers and keeps it separate. Advantages: * Lockdep and irqsoff event can now run in parallel since they no longer have their own calls. * This unifies the usecase of adding hooks to an irqsoff and irqson event, and a preemptoff and preempton event. 3 users of the events exist: - Lockdep - irqsoff and preemptoff tracers - irqs and preempt trace events The unification cleans up several ifdefs and makes the code in preempt tracer and irqsoff tracers simpler. It gets rid of all the horrific ifdeferry around PROVE_LOCKING and makes configuration of the different users of the tracepoints more easy and understandable. It also gets rid of the time_* function calls from the lockdep hooks used to call into the preemptirq tracer which is not needed anymore. The negative delta in lines of code in this patch is quite large too. In the patch we introduce a new CONFIG option PREEMPTIRQ_TRACEPOINTS as a single point for registering probes onto the tracepoints. With this, the web of config options for preempt/irq toggle tracepoints and its users becomes: PREEMPT_TRACER PREEMPTIRQ_EVENTS IRQSOFF_TRACER PROVE_LOCKING | | \ | | \ (selects) / \ \ (selects) / TRACE_PREEMPT_TOGGLE ----> TRACE_IRQFLAGS \ / \ (depends on) / PREEMPTIRQ_TRACEPOINTS Other than the performance tests mentioned in the previous patch, I also ran the locking API test suite. I verified that all tests cases are passing. I also injected issues by not registering lockdep probes onto the tracepoints and I see failures to confirm that the probes are indeed working. This series + lockdep probes not registered (just to inject errors): [ 0.000000] hard-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] soft-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/12:FAILED|FAILED| ok | [ 0.000000] sirq-safe-A => hirqs-on/21:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + irqs-on/12:FAILED|FAILED| ok | [ 0.000000] soft-safe-A + irqs-on/12:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + irqs-on/21:FAILED|FAILED| ok | [ 0.000000] soft-safe-A + irqs-on/21:FAILED|FAILED| ok | [ 0.000000] hard-safe-A + unsafe-B #1/123: ok | ok | ok | [ 0.000000] soft-safe-A + unsafe-B #1/123: ok | ok | ok | With this series + lockdep probes registered, all locking tests pass: [ 0.000000] hard-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] soft-irqs-on + irq-safe-A/21: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/12: ok | ok | ok | [ 0.000000] sirq-safe-A => hirqs-on/21: ok | ok | ok | [ 0.000000] hard-safe-A + irqs-on/12: ok | ok | ok | [ 0.000000] soft-safe-A + irqs-on/12: ok | ok | ok | [ 0.000000] hard-safe-A + irqs-on/21: ok | ok | ok | [ 0.000000] soft-safe-A + irqs-on/21: ok | ok | ok | [ 0.000000] hard-safe-A + unsafe-B #1/123: ok | ok | ok | [ 0.000000] soft-safe-A + unsafe-B #1/123: ok | ok | ok | Link: http://lkml.kernel.org/r/20180730222423.196630-4-joel@joelfernandes.org Acked-by: Peter Zijlstra (Intel) Reviewed-by: Namhyung Kim Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 11 +---------- include/linux/irqflags.h | 11 ++++++++--- include/linux/lockdep.h | 8 +++++--- include/linux/preempt.h | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63af5eb0ff46..a397907e8d72 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -701,16 +701,7 @@ static inline unsigned long get_lock_parent_ip(void) return CALLER_ADDR2; } -#ifdef CONFIG_IRQSOFF_TRACER - extern void time_hardirqs_on(unsigned long a0, unsigned long a1); - extern void time_hardirqs_off(unsigned long a0, unsigned long a1); -#else - static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { } - static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } -#endif - -#if defined(CONFIG_PREEMPT_TRACER) || \ - (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 9700f00bbc04..50edb9cbbd26 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -15,9 +15,16 @@ #include #include -#ifdef CONFIG_TRACE_IRQFLAGS +/* Currently trace_softirqs_on/off is used only by lockdep */ +#ifdef CONFIG_PROVE_LOCKING extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); +#else +# define trace_softirqs_on(ip) do { } while (0) +# define trace_softirqs_off(ip) do { } while (0) +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS extern void trace_hardirqs_on(void); extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) @@ -43,8 +50,6 @@ do { \ #else # define trace_hardirqs_on() do { } while (0) # define trace_hardirqs_off() do { } while (0) -# define trace_softirqs_on(ip) do { } while (0) -# define trace_softirqs_off(ip) do { } while (0) # define trace_hardirq_context(p) 0 # define trace_softirq_context(p) 0 # define trace_hardirqs_enabled(p) 0 diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6fc77d4dbdcd..a8113357ceeb 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -266,7 +266,8 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_info(void); +extern void lockdep_init(void); +extern void lockdep_init_early(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); @@ -406,7 +407,8 @@ static inline void lockdep_on(void) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) -# define lockdep_info() do { } while (0) +# define lockdep_init() do { } while (0) +# define lockdep_init_early() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) @@ -532,7 +534,7 @@ do { \ #endif /* CONFIG_LOCKDEP */ -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 5bd3f151da78..c01813c3fbe9 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -150,7 +150,7 @@ */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) -#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ -- cgit v1.2.3 From 016583d7030cec9b69e0d55269a5967f4ee871d2 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Tue, 31 Jul 2018 10:10:51 -0400 Subject: sunrpc: Change rpc_print_iostats to rpc_clnt_show_stats and handle rpc_clnt clones The existing rpc_print_iostats has a few shortcomings. First, the naming is not consistent with other functions in the kernel that display stats. Second, it is really displaying stats for an rpc_clnt structure as it displays both xprt stats and per-op stats. Third, it does not handle rpc_clnt clones, which is important for the one in-kernel tree caller of this function, the NFS client's nfs_show_stats function. Fix all of the above by renaming the rpc_print_iostats to rpc_clnt_show_stats and looping through any rpc_clnt clones via cl_parent. Once this interface is fixed, this addresses a problem with NFSv4. Before this patch, the /proc/self/mountstats always showed incorrect counts for NFSv4 lease and session related opcodes such as SEQUENCE, RENEW, SETCLIENTID, CREATE_SESSION, etc. These counts were always 0 even though many ops would go over the wire. The reason for this is there are multiple rpc_clnt structures allocated for any given NFSv4 mount, and inside nfs_show_stats() we callled into rpc_print_iostats() which only handled one of them, nfs_server->client. Fix these counts by calling sunrpc's new rpc_clnt_show_stats() function, which handles cloned rpc_clnt structs and prints the stats together. Note that one side-effect of the above is that multiple mounts from the same NFS server will show identical counts in the above ops due to the fact the one rpc_clnt (representing the NFSv4 client state) is shared across mounts. Signed-off-by: Dave Wysochanski Signed-off-by: Anna Schumaker --- include/linux/sunrpc/metrics.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h index 9baed7b355b2..1b3751327575 100644 --- a/include/linux/sunrpc/metrics.h +++ b/include/linux/sunrpc/metrics.h @@ -82,7 +82,7 @@ void rpc_count_iostats(const struct rpc_task *, struct rpc_iostats *); void rpc_count_iostats_metrics(const struct rpc_task *, struct rpc_iostats *); -void rpc_print_iostats(struct seq_file *, struct rpc_clnt *); +void rpc_clnt_show_stats(struct seq_file *, struct rpc_clnt *); void rpc_free_iostats(struct rpc_iostats *); #else /* CONFIG_PROC_FS */ @@ -95,7 +95,7 @@ static inline void rpc_count_iostats_metrics(const struct rpc_task *task, { } -static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {} +static inline void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) {} static inline void rpc_free_iostats(struct rpc_iostats *stats) {} #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 0f90be132cbf1537d87a6a8b9e80867adac892f6 Mon Sep 17 00:00:00 2001 From: Bill Baker Date: Tue, 19 Jun 2018 16:24:58 -0500 Subject: NFSv4 client live hangs after live data migration recovery After a live data migration event at the NFS server, the client may send I/O requests to the wrong server, causing a live hang due to repeated recovery events. On the wire, this will appear as an I/O request failing with NFS4ERR_BADSESSION, followed by successful CREATE_SESSION, repeatedly. NFS4ERR_BADSSESSION is returned because the session ID being used was issued by the other server and is not valid at the old server. The failure is caused by async worker threads having cached the transport (xprt) in the rpc_task structure. After the migration recovery completes, the task is redispatched and the task resends the request to the wrong server based on the old value still present in tk_xprt. The solution is to recompute the tk_xprt field of the rpc_task structure so that the request goes to the correct server. Signed-off-by: Bill Baker Reviewed-by: Chuck Lever Tested-by: Helen Chao Fixes: fb43d17210ba ("SUNRPC: Use the multipath iterator to assign a ...") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 9b11b6a0978c..73d5c4a870fa 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -156,6 +156,7 @@ int rpc_switch_client_transport(struct rpc_clnt *, void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); +void rpc_task_release_transport(struct rpc_task *); void rpc_task_release_client(struct rpc_task *); int rpcb_create_local(struct net *); -- cgit v1.2.3 From 3ebea280d7e9b610fa3d31c9cfd556b1705eeedf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 21:08:30 -0400 Subject: ring-buffer: Make ring_buffer_record_is_on() return bool The value of ring_buffer_record_is_on() is either true or false, so have its return value be bool. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 003d09ab308d..5176124fc4ba 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -164,7 +164,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer); void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); -int ring_buffer_record_is_on(struct ring_buffer *buffer); +bool ring_buffer_record_is_on(struct ring_buffer *buffer); int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); -- cgit v1.2.3 From d7224c0e128c7337c0b0f66ac20921fbbf4efc14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 1 Aug 2018 21:09:50 -0400 Subject: ring-buffer: Make ring_buffer_record_is_set_on() return bool The value of ring_buffer_record_is_set_on() is either true or false, so have its return value be bool. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5176124fc4ba..0940fda59872 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -165,7 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); bool ring_buffer_record_is_on(struct ring_buffer *buffer); -int ring_buffer_record_is_set_on(struct ring_buffer *buffer); +bool ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); -- cgit v1.2.3 From 5a5ba10f44fa1cd081cec38389e1b47f438fe25b Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 26 Jul 2018 15:40:56 +0200 Subject: rtc: remove struct rtc_task Include rtc_task members directly in rtc_timer member. Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index bf4d375025d1..6aedc30003e7 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -87,16 +87,11 @@ struct rtc_class_ops { int (*set_offset)(struct device *, long offset); }; -typedef struct rtc_task { - void (*func)(void *private_data); - void *private_data; -} rtc_task_t; - - struct rtc_timer { - struct rtc_task task; struct timerqueue_node node; ktime_t period; + void (*func)(void *private_data); + void *private_data; int enabled; }; -- cgit v1.2.3 From 6d54228fd1f293d00576ab2c3d2e4992c7cce12f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 25 Jun 2018 17:26:55 +0200 Subject: libceph: make ceph_osdc_notify{,_ack}() payload_len u32 The wire format dictates that payload_len fits into 4 bytes. Take u32 instead of size_t to reflect that. All callers pass a small integer, so no changes required. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d6ee04b4c41..d6fe7e4df8cf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -528,12 +528,12 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 notify_id, u64 cookie, void *payload, - size_t payload_len); + u32 payload_len); int ceph_osdc_notify(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, void *payload, - size_t payload_len, + u32 payload_len, u32 timeout, struct page ***preply_pages, size_t *preply_len); -- cgit v1.2.3 From c9ed51c9123ab5e8f79b7d53a9afd786b43d4fe6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Jun 2018 16:42:51 +0200 Subject: libceph: change ceph_pagelist_encode_string() to take u32 The wire format dictates that the length of string fits into 4 bytes. Take u32 instead of size_t to reflect that. We were already truncating len in ceph_pagelist_encode_32() -- this just pushes that truncation one level up. Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 7edcded07641..d0223364349f 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -68,7 +68,7 @@ static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) return ceph_pagelist_append(pl, &v, 1); } static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, - char *s, size_t len) + char *s, u32 len) { int ret = ceph_pagelist_encode_32(pl, len); if (ret) -- cgit v1.2.3 From 473bd2d780d1699d81b25f57c0ec4de633a28eb8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:34 +0200 Subject: libceph: use timespec64 in for keepalive2 and ticket validity ceph_con_keepalive_expired() is the last user of timespec_add() and some of the last uses of ktime_get_real_ts(). Replacing this with timespec64 based interfaces lets us remove that deprecated API. I'm introducing new ceph_encode_timespec64()/ceph_decode_timespec64() here that take timespec64 structures and convert to/from ceph_timespec, which is defined to have an unsigned 32-bit tv_sec member. This extends the range of valid times to year 2106, avoiding the year 2038 overflow. The ceph file system portion still uses the old functions for inode timestamps, this will be done separately after the VFS layer is converted. Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 20 +++++++++++++++++++- include/linux/ceph/messenger.h | 2 +- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index d143ac8879c6..094b9b4a34f3 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -194,8 +194,26 @@ ceph_decode_skip_n(p, end, sizeof(u8), bad) } while (0) /* - * struct ceph_timespec <-> struct timespec + * struct ceph_timespec <-> struct timespec64 */ +static inline void ceph_decode_timespec64(struct timespec64 *ts, + const struct ceph_timespec *tv) +{ + /* + * This will still overflow in year 2106. We could extend + * the protocol to steal two more bits from tv_nsec to + * add three more 136 year epochs after that the way ext4 + * does if necessary. + */ + ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec); + ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); +} +static inline void ceph_encode_timespec64(struct ceph_timespec *tv, + const struct timespec64 *ts) +{ + tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); + tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); +} static inline void ceph_decode_timespec(struct timespec *ts, const struct ceph_timespec *tv) { diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c7dfcb8a1fb2..a718b877c597 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -330,7 +330,7 @@ struct ceph_connection { int in_base_pos; /* bytes read */ __le64 in_temp_ack; /* for reading an ack */ - struct timespec last_keepalive_ack; /* keepalive2 ack stamp */ + struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ -- cgit v1.2.3 From fac02ddf910814c24f5d9d969dfdab5227f6f3eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:37 +0200 Subject: libceph: use timespec64 for r_mtime The request mtime field is used all over ceph, and is currently represented as a 'timespec' structure in Linux. This changes it to timespec64 to allow times beyond 2038, modifying all users at the same time. [ Remove now redundant ts variable in writepage_nounlock(). ] Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d6fe7e4df8cf..02096da01845 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -199,7 +199,7 @@ struct ceph_osd_request { /* set by submitter */ u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ struct ceph_snap_context *r_snapc; /* for writes */ - struct timespec r_mtime; /* ditto */ + struct timespec64 r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ @@ -253,7 +253,7 @@ struct ceph_osd_linger_request { struct ceph_osd_request_target t; u32 map_dne_bound; - struct timespec mtime; + struct timespec64 mtime; struct kref kref; struct mutex lock; @@ -508,7 +508,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_snap_context *sc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, + struct timespec64 *mtime, struct page **pages, int nr_pages); /* watch/notify */ -- cgit v1.2.3 From f7e52d8efe8588c5d4b4c78eb33da81d89486c1a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 23 Jul 2018 14:11:40 +0200 Subject: libceph: remove now unused ceph_{en,de}code_timespec() Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 094b9b4a34f3..a6c2a48d42e0 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -214,18 +214,6 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv, tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } -static inline void ceph_decode_timespec(struct timespec *ts, - const struct ceph_timespec *tv) -{ - ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec); - ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); -} -static inline void ceph_encode_timespec(struct ceph_timespec *tv, - const struct timespec *ts) -{ - tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); - tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); -} /* * sockaddr_storage <-> ceph_sockaddr -- cgit v1.2.3 From 262614c4294d33b1f19e0d18c0091d9c329b544a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 Jul 2018 15:17:46 +0200 Subject: libceph: store ceph_auth_handshake pointer in ceph_connection We already copy authorizer_reply_buf and authorizer_reply_buf_len into ceph_connection. Factoring out __prepare_write_connect() requires two more: authorizer_buf and authorizer_buf_len. Store the pointer to the handshake in con->auth rather than piling on. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index a718b877c597..021718570b50 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -286,9 +286,8 @@ struct ceph_connection { attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ + struct ceph_auth_handshake *auth; int auth_retry; /* true if we need a newer authorizer */ - void *auth_reply_buf; /* where to put the authorizer reply */ - int auth_reply_buf_len; struct mutex mutex; -- cgit v1.2.3 From 6daca13d2e72bedaaacfc08f873114c9307d5aea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:18:34 +0200 Subject: libceph: add authorizer challenge When a client authenticates with a service, an authorizer is sent with a nonce to the service (ceph_x_authorize_[ab]) and the service responds with a mutation of that nonce (ceph_x_authorize_reply). This lets the client verify the service is who it says it is but it doesn't protect against a replay: someone can trivially capture the exchange and reuse the same authorizer to authenticate themselves. Allow the service to reject an initial authorizer with a random challenge (ceph_x_authorize_challenge). The client then has to respond with an updated authorizer proving they are able to decrypt the service's challenge and that the new authorizer was produced for this specific connection instance. The accepting side requires this challenge and response unconditionally if the client side advertises they have CEPHX_V2 feature bit. This addresses CVE-2018-1128. Link: http://tracker.ceph.com/issues/24836 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/auth.h | 8 ++++++++ include/linux/ceph/messenger.h | 3 +++ include/linux/ceph/msgr.h | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index e931da8424a4..6728c2ee0205 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -64,6 +64,10 @@ struct ceph_auth_client_ops { /* ensure that an existing authorizer is up to date */ int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + int (*add_authorizer_challenge)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a); void (*invalidate_authorizer)(struct ceph_auth_client *ac, @@ -118,6 +122,10 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *a); +int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 021718570b50..fc2b4491ee0a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -31,6 +31,9 @@ struct ceph_connection_operations { struct ceph_auth_handshake *(*get_authorizer) ( struct ceph_connection *con, int *proto, int force_new); + int (*add_authorizer_challenge)(struct ceph_connection *con, + void *challenge_buf, + int challenge_buf_len); int (*verify_authorizer_reply) (struct ceph_connection *con); int (*invalidate_authorizer)(struct ceph_connection *con); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 73ae2a926548..9e50aede46c8 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -91,7 +91,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ #define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */ #define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */ - +#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */ /* * connection negotiation -- cgit v1.2.3 From cc255c76c70f7a87d97939621eae04b600d9f4a1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:25:32 +0200 Subject: libceph: implement CEPHX_V2 calculation mode Derive the signature from the entire buffer (both AES cipher blocks) instead of using just the first half of the first block, leaving out data_crc entirely. This addresses CVE-2018-1129. Link: http://tracker.ceph.com/issues/24837 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 3901927cf6a0..6b92b3395fa9 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -165,9 +165,9 @@ DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap -DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit* +DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* +DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit* -DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down! DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing @@ -210,7 +210,8 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin CEPH_FEATURE_SERVER_JEWEL | \ CEPH_FEATURE_MON_STATEFUL_SUB | \ CEPH_FEATURE_CRUSH_TUNABLES5 | \ - CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ + CEPH_FEATURE_CEPHX_V2) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From 088fe47ce952542389e604e83f533811750aaf7c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 17:26:49 -0500 Subject: signal: Add calculate_sigpending() Add a function calculate_sigpending to test to see if any signals are pending for a new task immediately following fork. Signals have to happen either before or after fork. Today our practice is to push all of the signals to before the fork, but that has the downside that frequent or periodic signals can make fork take much much longer than normal or prevent fork from completing entirely. So we need move signals that we can after the fork to prevent that. This updates the code to set TIF_SIGPENDING on a new task if there are signals or other activities that have moved so that they appear to happen after the fork. As the code today restarts if it sees any such activity this won't immediately have an effect, as there will be no reason for it to set TIF_SIGPENDING immediately after the fork. Adding calculate_sigpending means the code in fork can safely be changed to not always restart if a signal is pending. The new calculate_sigpending function sets sigpending if there are pending bits in jobctl, pending signals, the freezer needs to freeze the new task or the live kernel patching framework need the new thread to take the slow path to userspace. I have verified that setting TIF_SIGPENDING does make a new process take the slow path to userspace before it executes it's first userspace instruction. I have looked at the callers of signal_wake_up and the code paths setting TIF_SIGPENDING and I don't see anything else that needs to be handled. The code probably doesn't need to set TIF_SIGPENDING for the kernel live patching as it uses a separate thread flag as well. But at this point it seems safer reuse the recalc_sigpending logic and get the kernel live patching folks to sort out their story later. V2: I have moved the test into schedule_tail where siglock can be grabbed and recalc_sigpending can be reused directly. Further as the last action of setting up a new task this guarantees that TIF_SIGPENDING will be properly set in the new process. The helper calculate_sigpending takes the siglock and uncontitionally sets TIF_SIGPENDING and let's recalc_sigpending clear TIF_SIGPENDING if it is unnecessary. This allows reusing the existing code and keeps maintenance of the conditions simple. Oleg Nesterov suggested the movement and pointed out the need to take siglock if this code was going to be called while the new task is discoverable. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 94558ffa82ab..b55fd293c1e5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -372,6 +372,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); +extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); -- cgit v1.2.3 From 4390e9eadbbb6774b7ba03fde0a0fdf3f07db4cd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Aug 2018 20:10:54 -0500 Subject: fork: Skip setting TIF_SIGPENDING in ptrace_init_task The code in calculate_sigpending will now handle this so it is just redundant and possibly a little confusing to continue setting TIF_SIGPENDING in ptrace_init_task. Suggested-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- include/linux/ptrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 037bf0ef1ae9..4f36431c380b 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -214,8 +214,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); - - set_tsk_thread_flag(child, TIF_SIGPENDING); } else child->ptracer_cred = NULL; -- cgit v1.2.3 From 924de3b8c9410c404c6eda7abffd282b97b3ff7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 13:38:00 -0500 Subject: fork: Have new threads join on-going signal group stops There are only two signals that are delivered to every member of a signal group: SIGSTOP and SIGKILL. Signal delivery requires every signal appear to be delivered either before or after a clone syscall. SIGKILL terminates the clone so does not need to be considered. Which leaves only SIGSTOP that needs to be considered when creating new threads. Today in the event of a group stop TIF_SIGPENDING will get set and the fork will restart ensuring the fork syscall participates in the group stop. A fork (especially of a process with a lot of memory) is one of the most expensive system so we really only want to restart a fork when necessary. It is easy so check to see if a SIGSTOP is ongoing and have the new thread join it immediate after the clone completes. Making it appear the clone completed happened just before the SIGSTOP. The calculate_sigpending function will see the bits set in jobctl and set TIF_SIGPENDING to ensure the new task takes the slow path to userspace. V2: The call to task_join_group_stop was moved before the new task is added to the thread group list. This should not matter as sighand->siglock is held over both the addition of the threads, the call to task_join_group_stop and do_signal_stop. But the change is trivial and it is one less thing to worry about when reading the code. Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b55fd293c1e5..ae2b0b81be25 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -385,6 +385,8 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) signal_wake_up_state(t, resume ? __TASK_TRACED : 0); } +void task_join_group_stop(struct task_struct *task); + #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on -- cgit v1.2.3 From d9cfe2ce246845b9cca0ec1b881e826965893c58 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 23 Jul 2018 22:26:05 +0200 Subject: i2c: quirks: add zero length checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some adapters do not support a message length of 0. Add this as a quirk so drivers don't have to open code it. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Andy Shevchenko Tested-by: Jarkko Nikula Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index bc8d42f8544f..2a98d0886d2e 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -661,6 +661,10 @@ struct i2c_adapter_quirks { I2C_AQ_COMB_READ_SECOND | I2C_AQ_COMB_SAME_ADDR) /* clock stretching is not supported */ #define I2C_AQ_NO_CLK_STRETCH BIT(4) +/* message cannot have length of 0 */ +#define I2C_AQ_NO_ZERO_LEN_READ BIT(5) +#define I2C_AQ_NO_ZERO_LEN_WRITE BIT(6) +#define I2C_AQ_NO_ZERO_LEN (I2C_AQ_NO_ZERO_LEN_READ | I2C_AQ_NO_ZERO_LEN_WRITE) /* * i2c_adapter is the structure used to identify a physical i2c bus along -- cgit v1.2.3 From eac7e072d7e99fad1b6e817c608b03c48205241e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Aug 2018 10:22:35 -0700 Subject: Revert "ata: ahci_platform: allow disabling of hotplug to save power" This reverts commit aece27a2f01be4bb7683790f69cd1bed3a0929a2. Causes boot failure on some devices. http://lore.kernel.org/r/CA+G9fYuKW_jCFZPqG4tz=QY9ROfHO38KiCp9XTA+KaDOFVtcqQ@mail.gmail.com Signed-off-by: Tejun Heo --- include/linux/ahci_platform.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 6396e6982103..1b0a17b22cd3 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -42,7 +42,5 @@ int ahci_platform_suspend_host(struct device *dev); int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); int ahci_platform_resume(struct device *dev); -int ahci_platform_runtime_suspend(struct device *dev); -int ahci_platform_runtime_resume(struct device *dev); #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From d88e61faad526a5850e9330c846641b91cf971e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:36:26 +0200 Subject: iommu: Remove the ->map_sg indirection All iommu drivers use the default_iommu_map_sg implementation, and there is no good reason to ever override it. Just expose it as iommu_map_sg directly and remove the indirection, specially in our post-spectre world where indirect calls are horribly expensive. Signed-off-by: Christoph Hellwig Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7447b0b0579a..87994c265bf5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -166,8 +166,6 @@ struct iommu_resv_region { * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain - * @map_sg: map a scatter-gather list of physically contiguous memory chunks - * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -201,8 +199,6 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size); - size_t (*map_sg)(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg, unsigned int nents, int prot); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_range_add)(struct iommu_domain *domain, unsigned long iova, size_t size); @@ -303,9 +299,8 @@ extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size); -extern size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg,unsigned int nents, - int prot); +extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg,unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); @@ -378,13 +373,6 @@ static inline void iommu_tlb_sync(struct iommu_domain *domain) domain->ops->iotlb_sync(domain); } -static inline size_t iommu_map_sg(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot) -{ - return domain->ops->map_sg(domain, iova, sg, nents, prot); -} - /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ -- cgit v1.2.3 From 4717be73c2843a3d6d8546872177a19358f6b7b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Jul 2018 17:39:25 +0300 Subject: i2c: core: Parse SDA hold time from firmware There are two drivers already using the SDA hold time setting. It might be more in the future, thus, make I2C core to parse the setting for us if provided by firmware. Signed-off-by: Andy Shevchenko Tested-by: Alexandre Belloni Reviewed-by: Alexandre Belloni Acked-by: Ludovic Desroches Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2a98d0886d2e..36f357ecdf67 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -564,6 +564,7 @@ struct i2c_lock_operations { * @scl_fall_ns: time SCL signal takes to fall in ns; t(f) in the I2C specification * @scl_int_delay_ns: time IP core additionally needs to setup SCL in ns * @sda_fall_ns: time SDA signal takes to fall in ns; t(f) in the I2C specification + * @sda_hold_ns: time IP core additionally needs to hold SDA in ns */ struct i2c_timings { u32 bus_freq_hz; @@ -571,6 +572,7 @@ struct i2c_timings { u32 scl_fall_ns; u32 scl_int_delay_ns; u32 sda_fall_ns; + u32 sda_hold_ns; }; /** -- cgit v1.2.3 From cb95deea0b4aa5c7c7423f4e075a3ddcd59e710b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jul 2018 15:13:29 -0400 Subject: NFS OFFLOAD_CANCEL xdr Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 1 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 57ffaa20d564..c44b87293229 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -527,6 +527,7 @@ enum { NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, + NFSPROC4_CLNT_OFFLOAD_CANCEL, NFSPROC4_CLNT_LOOKUPP, }; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2c18d618604e..fbc735f08d7e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -255,5 +255,6 @@ struct nfs_server { #define NFS_CAP_LAYOUTSTATS (1U << 22) #define NFS_CAP_CLONE (1U << 23) #define NFS_CAP_COPY (1U << 24) +#define NFS_CAP_OFFLOAD_CANCEL (1U << 25) #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3b7325cfb291..85e928a56cef 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1403,6 +1403,18 @@ struct nfs42_copy_res { struct nfs_commitres commit_res; }; +struct nfs42_offload_status_args { + struct nfs4_sequence_args osa_seq_args; + struct nfs_fh *osa_src_fh; + nfs4_stateid osa_stateid; +}; + +struct nfs42_offload_status_res { + struct nfs4_sequence_res osr_seq_res; + uint64_t osr_count; + int osr_status; +}; + struct nfs42_seek_args { struct nfs4_sequence_args seq_args; -- cgit v1.2.3 From 67aa7444c4beb40aafedd8d2c60bbcc54987adda Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jul 2018 15:13:30 -0400 Subject: NFS COPY xdr handle async reply If server returns async reply, it must include a callback stateid, wr_callback_id in the write_response4. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 85e928a56cef..06ddfa31cbef 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1391,6 +1391,7 @@ struct nfs42_copy_args { }; struct nfs42_write_res { + nfs4_stateid stateid; u64 count; struct nfs_writeverf verifier; }; -- cgit v1.2.3 From 62164f317972fcd36590578888f33a1994dda519 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jul 2018 15:13:31 -0400 Subject: NFS add support for asynchronous COPY Change xdr to always send COPY asynchronously. Keep the list copies send in a list under a server structure. Once copy is sent, it waits on a completion structure that will be signalled by the callback thread that receives CB_OFFLOAD. If CB_OFFLOAD returned an error and even if it returned partial bytes, ignore them (as we can't commit without a verifier to match) and return an error. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 9 +++++++++ include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 1 + 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 2f129bbfaae8..645ad8e342f6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -185,6 +185,15 @@ struct nfs_inode { struct inode vfs_inode; }; +struct nfs4_copy_state { + struct list_head copies; + nfs4_stateid stateid; + struct completion completion; + uint64_t count; + struct nfs_writeverf verf; + int error; +}; + /* * Access bit flags */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index fbc735f08d7e..f88952d7b9fb 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -208,6 +208,7 @@ struct nfs_server { struct list_head state_owners_lru; struct list_head layouts; struct list_head delegations; + struct list_head ss_copies; unsigned long mig_gen; unsigned long mig_status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 06ddfa31cbef..bd1c889a9ed9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1388,6 +1388,7 @@ struct nfs42_copy_args { u64 dst_pos; u64 count; + bool sync; }; struct nfs42_write_res { -- cgit v1.2.3 From bc0c9079b48ddcf1f8a6e1aaa277288b263c78d8 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 9 Jul 2018 15:13:32 -0400 Subject: NFS handle COPY reply CB_OFFLOAD call race It's possible that server replies back with CB_OFFLOAD call and COPY reply at the same time such that client will process CB_OFFLOAD before reply to COPY. For that keep a list of pending callback stateids received and then before waiting on completion check the pending list. Cleanup any pending copies on the client shutdown. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f88952d7b9fb..bf39d9c92201 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -121,6 +121,7 @@ struct nfs_client { #endif struct net *cl_net; + struct list_head pending_cb_stateids; }; /* -- cgit v1.2.3 From c3ad2c3b02e953ead2b8d52a0c9e70312930c3d0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 Jul 2018 15:20:37 -0500 Subject: signal: Don't restart fork when signals come in. Wen Yang and majiang report that a periodic signal received during fork can cause fork to continually restart preventing an application from making progress. The code was being overly pessimistic. Fork needs to guarantee that a signal sent to multiple processes is logically delivered before the fork and just to the forking process or logically delivered after the fork to both the forking process and it's newly spawned child. For signals like periodic timers that are always delivered to a single process fork can safely complete and let them appear to logically delivered after the fork(). While examining this issue I also discovered that fork today will miss signals delivered to multiple processes during the fork and handled by another thread. Similarly the current code will also miss blocked signals that are delivered to multiple process, as those signals will not appear pending during fork. Add a list of each thread that is currently forking, and keep on that list a signal set that records all of the signals sent to multiple processes. When fork completes initialize the new processes shared_pending signal set with it. The calculate_sigpending function will see those signals and set TIF_SIGPENDING causing the new task to take the slow path to userspace to handle those signals. Making it appear as if those signals were received immediately after the fork. It is not possible to send real time signals to multiple processes and exceptions don't go to multiple processes, which means that that are no signals sent to multiple processes that require siginfo. This means it is safe to not bother collecting siginfo on signals sent during fork. The sigaction of a child of fork is initially the same as the sigaction of the parent process. So a signal the parent ignores the child will also initially ignore. Therefore it is safe to ignore signals sent to multiple processes and ignored by the forking process. Signals sent to only a single process or only a single thread and delivered during fork are treated as if they are received after the fork, and generally not dealt with. They won't cause any problems. V2: Added removal from the multiprocess list on failure. V3: Use -ERESTARTNOINTR directly V4: - Don't queue both SIGCONT and SIGSTOP - Initialize signal_struct.multiprocess in init_task - Move setting of shared_pending to before the new task is visible to signals. This prevents signals from comming in before shared_pending.signal is set to delayed.signal and being lost. V5: - rework list add and delete to account for idle threads v6: - Use sigdelsetmask when removing stop signals Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200447 Reported-by: Wen Yang and Reported-by: majiang Fixes: 4a2c7a7837da ("[PATCH] make fork() atomic wrt pgrp/session signals") Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ae2b0b81be25..4e9b77fb702d 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -69,6 +69,11 @@ struct thread_group_cputimer { bool checking_timer; }; +struct multiprocess_signals { + sigset_t signal; + struct hlist_node node; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always @@ -90,6 +95,9 @@ struct signal_struct { /* shared signal handling: */ struct sigpending shared_pending; + /* For collecting multiprocess signals during fork */ + struct hlist_head multiprocess; + /* thread group exit support */ int group_exit_code; /* overloaded: -- cgit v1.2.3 From 64bed6cbe38bc95689fb9399872d9ce250192f90 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Jul 2018 17:22:24 +0300 Subject: nfsd: fix leaked file lock with nfs exported overlayfs nfsd and lockd call vfs_lock_file() to lock/unlock the inode returned by locks_inode(file). Many places in nfsd/lockd code use the inode returned by file_inode(file) for lock manipulation. With Overlayfs, file_inode() (the underlying inode) is not the same object as locks_inode() (the overlay inode). This can result in "Leaked POSIX lock" messages and eventually to a kernel crash as reported by Eddie Horng: https://marc.info/?l=linux-unionfs&m=153086643202072&w=2 Fix all the call sites in nfsd/lockd that should use locks_inode(). This is a correctness bug that manifested when overlayfs gained NFS export support in v4.16. Reported-by: Eddie Horng Tested-by: Eddie Horng Cc: Jeff Layton Fixes: 8383f1748829 ("ovl: wire up NFS export operations") Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Signed-off-by: J. Bruce Fields --- include/linux/lockd/lockd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 4fd95dbeb52f..b065ef406770 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -299,7 +299,7 @@ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return file_inode(file->f_file); + return locks_inode(file->f_file); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) @@ -359,7 +359,7 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) + return locks_inode(fl1->fl_file) == locks_inode(fl2->fl_file) && fl1->fl_pid == fl2->fl_pid && fl1->fl_owner == fl2->fl_owner && fl1->fl_start == fl2->fl_start -- cgit v1.2.3 From 3fd9557aec919e2db99365ad5a2c00d04ae8893c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:19:05 -0400 Subject: NFSD: Refactor the generic write vector fill helper fill_in_write_vector() is nearly the same logic as svc_fill_write_vector(), but there are a few differences so that the former can handle multiple WRITE payloads in a single COMPOUND. svc_fill_write_vector() can be adjusted so that it can be used in the NFSv4 WRITE code path too. Instead of assuming the pages are coming from rq_args.pages, have the caller pass in the page list. The immediate benefit is a reduction of code duplication. It also prevents the NFSv4 WRITE decoder from passing an empty vector element when the transport has provided the payload in the xdr_buf's page array. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 574368e8a16f..43f88bd7b601 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -496,6 +496,7 @@ void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct page **pages, struct kvec *first, size_t total); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, size_t total); -- cgit v1.2.3 From 11b4d66ea3313d9b03a83b80458ddee64990e3c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 27 Jul 2018 11:19:10 -0400 Subject: NFSD: Handle full-length symlinks I've given up on the idea of zero-copy handling of SYMLINK on the server side. This is because the Linux VFS symlink API requires the symlink pathname to be in a NUL-terminated kmalloc'd buffer. The NUL-termination is going to be problematic (watching out for landing on a page boundary and dealing with a 4096-byte pathname). I don't believe that SYMLINK creation is on a performance path or is requested frequently enough that it will cause noticeable CPU cache pollution due to data copies. There will be two places where a transport callout will be necessary to fill in the rqstp: one will be in the svc_fill_symlink_pathname() helper that is used by NFSv2 and NFSv3, and the other will be in nfsd4_decode_create(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 43f88bd7b601..73e130a840ce 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -499,7 +499,8 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, struct kvec *first, size_t total); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, - struct kvec *first, size_t total); + struct kvec *first, void *p, + size_t total); #define RPC_MAX_ADDRBUFLEN (63U) -- cgit v1.2.3 From bff1b208a5d1dbb2355822ef859edcb9be0379e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Aug 2018 15:50:58 -0400 Subject: tracing: Partial revert of "tracing: Centralize preemptirq tracepoints and unify their usage" Joel Fernandes created a nice patch that cleaned up the duplicate hooks used by lockdep and irqsoff latency tracer. It made both use tracepoints. But it caused lockdep to trigger several false positives. We have not figured out why yet, but removing lockdep from using the trace event hooks and just call its helper functions directly (like it use to), makes the problem go away. This is a partial revert of the clean up patch c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") that adds direct calls for lockdep, but also keeps most of the clean up done to get rid of the horrible preprocessor if statements. Link: http://lkml.kernel.org/r/20180806155058.5ee875f4@gandalf.local.home Cc: Peter Zijlstra Reviewed-by: Joel Fernandes (Google) Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") Signed-off-by: Steven Rostedt (VMware) --- include/linux/irqflags.h | 8 ++++++-- include/linux/lockdep.h | 2 -- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 50edb9cbbd26..21619c92c377 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -19,9 +19,13 @@ #ifdef CONFIG_PROVE_LOCKING extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void lockdep_hardirqs_on(unsigned long ip); + extern void lockdep_hardirqs_off(unsigned long ip); #else -# define trace_softirqs_on(ip) do { } while (0) -# define trace_softirqs_off(ip) do { } while (0) + static inline void trace_softirqs_on(unsigned long ip) { } + static inline void trace_softirqs_off(unsigned long ip) { } + static inline void lockdep_hardirqs_on(unsigned long ip) { } + static inline void lockdep_hardirqs_off(unsigned long ip) { } #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a8113357ceeb..b0d0b51c4d85 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -267,7 +267,6 @@ struct held_lock { * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); -extern void lockdep_init_early(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); @@ -408,7 +407,6 @@ static inline void lockdep_on(void) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) -# define lockdep_init_early() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) -- cgit v1.2.3 From e4648aa4f98a87cf0a83f73a5864cede073053a0 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Aug 2018 15:33:01 -0400 Subject: NFS recover from destination server reboot for copies Mark the destination state to indicate a server-side copy is happening. On detecting a reboot and recovering open state check if any state is engaged in a server-side copy, if so, find the copy and mark it and then signal the waiting thread. Upon wakeup, if copy was marked then propage EAGAIN to the nfsd_copy_file_range and restart the copy from scratch. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 645ad8e342f6..a0831e9d19c9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -192,6 +192,8 @@ struct nfs4_copy_state { uint64_t count; struct nfs_writeverf verf; int error; + int flags; + struct nfs4_state *parent_state; }; /* -- cgit v1.2.3 From 6d43743e9079ac0531b60cde7eadd0f042873344 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 9 Aug 2018 09:48:52 +0530 Subject: Uprobe: Additional argument arch_uprobe to uprobe_write_opcode() Add addition argument 'arch_uprobe' to uprobe_write_opcode(). We need this in later set of patches. Link: http://lkml.kernel.org/r/20180809041856.1547-3-ravi.bangoria@linux.ibm.com Reviewed-by: Song Liu Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- include/linux/uprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0a294e950df8..bb9d2084af03 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -121,7 +121,7 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -- cgit v1.2.3 From 6855dc41b24619c3d1de3dbd27dd0546b0e45272 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 6 Jun 2018 15:54:11 +0300 Subject: x86: Add entry trampolines to kcore Without program headers for PTI entry trampoline pages, the trampoline virtual addresses do not map to anything. Example before: sudo gdb --quiet vmlinux /proc/kcore Reading symbols from vmlinux...done. [New process 1] Core was generated by `BOOT_IMAGE=/boot/vmlinuz-4.16.0 root=UUID=a6096b83-b763-4101-807e-f33daff63233'. #0 0x0000000000000000 in irq_stack_union () (gdb) x /21ib 0xfffffe0000006000 0xfffffe0000006000: Cannot access memory at address 0xfffffe0000006000 (gdb) quit After: sudo gdb --quiet vmlinux /proc/kcore [sudo] password for ahunter: Reading symbols from vmlinux...done. [New process 1] Core was generated by `BOOT_IMAGE=/boot/vmlinuz-4.16.0-fix-4-00005-gd6e65a8b4072 root=UUID=a6096b83-b7'. #0 0x0000000000000000 in irq_stack_union () (gdb) x /21ib 0xfffffe0000006000 0xfffffe0000006000: swapgs 0xfffffe0000006003: mov %rsp,-0x3e12(%rip) # 0xfffffe00000021f8 0xfffffe000000600a: xchg %ax,%ax 0xfffffe000000600c: mov %cr3,%rsp 0xfffffe000000600f: bts $0x3f,%rsp 0xfffffe0000006014: and $0xffffffffffffe7ff,%rsp 0xfffffe000000601b: mov %rsp,%cr3 0xfffffe000000601e: mov -0x3019(%rip),%rsp # 0xfffffe000000300c 0xfffffe0000006025: pushq $0x2b 0xfffffe0000006027: pushq -0x3e35(%rip) # 0xfffffe00000021f8 0xfffffe000000602d: push %r11 0xfffffe000000602f: pushq $0x33 0xfffffe0000006031: push %rcx 0xfffffe0000006032: push %rdi 0xfffffe0000006033: mov $0xffffffff91a00010,%rdi 0xfffffe000000603a: callq 0xfffffe0000006046 0xfffffe000000603f: pause 0xfffffe0000006041: lfence 0xfffffe0000006044: jmp 0xfffffe000000603f 0xfffffe0000006046: mov %rdi,(%rsp) 0xfffffe000000604a: retq (gdb) quit In addition, entry trampolines all map to the same page. Represent that by giving the corresponding program headers in kcore the same offset. This has the benefit that, when perf tools uses /proc/kcore as a source for kernel object code, samples from different CPU trampolines are aggregated together. Note, such aggregation is normal for profiling i.e. people want to profile the object code, not every different virtual address the object code might be mapped to (across different processes for example). Notes by PeterZ: This also adds the KCORE_REMAP functionality. Signed-off-by: Adrian Hunter Acked-by: Andi Kleen Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Joerg Roedel Cc: Thomas Gleixner Cc: x86@kernel.org Link: http://lkml.kernel.org/r/1528289651-4113-4-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/kcore.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8de55e4b5ee9..bc088ef96358 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -12,11 +12,13 @@ enum kcore_type { KCORE_VMEMMAP, KCORE_USER, KCORE_OTHER, + KCORE_REMAP, }; struct kcore_list { struct list_head list; unsigned long addr; + unsigned long vaddr; size_t size; int type; }; @@ -36,11 +38,22 @@ struct vmcoredd_node { #ifdef CONFIG_PROC_KCORE extern void kclist_add(struct kcore_list *, void *, size_t, int type); +static inline +void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz) +{ + m->vaddr = (unsigned long)vaddr; + kclist_add(m, addr, sz, KCORE_REMAP); +} #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) { } + +static inline +void kclist_add_remap(struct kcore_list *m, void *addr, void *vaddr, size_t sz) +{ +} #endif #endif /* _LINUX_KCORE_H */ -- cgit v1.2.3 From 284ce4011ba60d6c487b668eea729b6294930806 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jul 2018 21:50:32 -0700 Subject: x86/memory_failure: Introduce {set, clear}_mce_nospec() Currently memory_failure() returns zero if the error was handled. On that result mce_unmap_kpfn() is called to zap the page out of the kernel linear mapping to prevent speculative fetches of potentially poisoned memory. However, in the case of dax mapped devmap pages the page may be in active permanent use by the device driver, so it cannot be unmapped from the kernel. Instead of marking the page not present, marking the page UC should be sufficient for preventing poison from being pre-fetched into the cache. Convert mce_unmap_pfn() to set_mce_nospec() remapping the page as UC, to hide it from speculative accesses. Given that that persistent memory errors can be cleared by the driver, include a facility to restore the page to cacheable operation, clear_mce_nospec(). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Cc: Acked-by: Tony Luck Signed-off-by: Dan Williams Acked-by: Ingo Molnar Signed-off-by: Dave Jiang --- include/linux/set_memory.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index da5178216da5..2a986d282a97 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -17,6 +17,20 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +#ifndef set_mce_nospec +static inline int set_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + +#ifndef clear_mce_nospec +static inline int clear_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + #ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT static inline int set_memory_encrypted(unsigned long addr, int numpages) { -- cgit v1.2.3 From 04f264d3a8b0eb25d378127bd78c3c9a0261c828 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 20 Aug 2018 15:36:17 -0700 Subject: compiler.h: Allow arch-specific asm/compiler.h We have a need to override the definition of barrier_before_unreachable() for MIPS, which means we either need to add architecture-specific code into linux/compiler-gcc.h or we need to allow the architecture to provide a header that can define the macro before the generic definition. The latter seems like the better approach. A straightforward approach to the per-arch header is to make use of asm-generic to provide a default empty header & adjust architectures which don't need anything specific to make use of that by adding the header to generic-y. Unfortunately this doesn't work so well due to commit 28128c61e08e ("kconfig.h: Include compiler types to avoid missed struct attributes") which caused linux/compiler_types.h to be included in the compilation of every C file via the -include linux/kconfig.h flag in c_flags. Because the -include flag is present for all C files we compile, we need the architecture-provided header to be present before any C files are compiled. If any C files can be compiled prior to the asm-generic header wrappers being generated then we hit a build failure due to missing header. Such cases do exist - one pointed out by the kbuild test robot is the compilation of arch/ia64/kernel/nr-irqs.c, which occurs as part of the archprepare target [1]. This leaves us with a few options: 1) Use generic-y & fix any build failures we find by enforcing ordering such that the asm-generic target occurs before any C compilation, such that linux/compiler_types.h can always include the generated asm-generic wrapper which in turn includes the empty asm-generic header. This would rely on us finding all the problematic cases - I don't know for sure that the ia64 issue is the only one. 2) Add an actual empty header to each architecture, so that we don't need the generated asm-generic wrapper. This seems messy. 3) Give up & add #ifdef CONFIG_MIPS or similar to linux/compiler_types.h. This seems messy too. 4) Include the arch header only when it's actually needed, removing the need for the asm-generic wrapper for all other architectures. This patch allows us to use approach 4, by including an asm/compiler.h header from linux/compiler_types.h after the inclusion of the compiler-specific linux/compiler-*.h header(s). We do this conditionally, only when CONFIG_HAVE_ARCH_COMPILER_H is selected, in order to avoid the need for asm-generic wrappers & the associated build ordering issue described above. The asm/compiler.h header is included after the generic linux/compiler-*.h header(s) for consistency with the way linux/compiler-intel.h & linux/compiler-clang.h are included after the linux/compiler-gcc.h header that they override. [1] https://lists.01.org/pipermail/kbuild-all/2018-August/051175.html Signed-off-by: Paul Burton Reviewed-by: Masahiro Yamada Patchwork: https://patchwork.linux-mips.org/patch/20269/ Cc: Arnd Bergmann Cc: James Hogan Cc: Masahiro Yamada Cc: Ralf Baechle Cc: linux-arch@vger.kernel.org Cc: linux-kbuild@vger.kernel.org Cc: linux-mips@linux-mips.org --- include/linux/compiler_types.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6b79a9bba9a7..4be464a07612 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -78,6 +78,18 @@ extern void __chk_io_ptr(const volatile void __iomem *); #include #endif +/* + * Some architectures need to provide custom definitions of macros provided + * by linux/compiler-*.h, and can do so using asm/compiler.h. We include that + * conditionally rather than using an asm-generic wrapper in order to avoid + * build failures if any C compilation, which will include this file via an + * -include argument in c_flags, occurs prior to the asm-generic wrappers being + * generated. + */ +#ifdef CONFIG_HAVE_ARCH_COMPILER_H +#include +#endif + /* * Generic compiler-dependent macros required for kernel * build go below this comment. Actual compiler/compiler version -- cgit v1.2.3 From 5ade60dda43c8906d4554374226c2eb11cc2ffba Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 20 Mar 2018 17:07:11 -0400 Subject: ida: Add new API Add ida_alloc(), ida_alloc_min(), ida_alloc_max(), ida_alloc_range() and ida_free(). The ida_alloc_max() and ida_alloc_range() functions differ from ida_simple_get() in that they take an inclusive 'max' parameter instead of an exclusive 'end' parameter. Callers are about evenly split whether they'd like inclusive or exclusive parameters and 'max' is easier to document than 'end'. Change the IDA allocation to first attempt to allocate a bit using existing memory, and only allocate memory afterwards. Also change the behaviour of 'min' > INT_MAX from being a BUG() to returning -ENOSPC. Leave compatibility wrappers in place for ida_simple_get() and ida_simple_remove() to avoid changing all callers. Signed-off-by: Matthew Wilcox --- include/linux/idr.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index e856f4e0ab35..cd339da0b1aa 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -230,15 +230,68 @@ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); void ida_remove(struct ida *ida, int id); void ida_destroy(struct ida *ida); -int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, - gfp_t gfp_mask); -void ida_simple_remove(struct ida *ida, unsigned int id); +int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); +void ida_free(struct ida *, unsigned int id); + +/** + * ida_alloc() - Allocate an unused ID. + * @ida: IDA handle. + * @gfp: Memory allocation flags. + * + * Allocate an ID between 0 and %INT_MAX, inclusive. + * + * Context: Any context. + * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, + * or %-ENOSPC if there are no free IDs. + */ +static inline int ida_alloc(struct ida *ida, gfp_t gfp) +{ + return ida_alloc_range(ida, 0, ~0, gfp); +} + +/** + * ida_alloc_min() - Allocate an unused ID. + * @ida: IDA handle. + * @min: Lowest ID to allocate. + * @gfp: Memory allocation flags. + * + * Allocate an ID between @min and %INT_MAX, inclusive. + * + * Context: Any context. + * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, + * or %-ENOSPC if there are no free IDs. + */ +static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) +{ + return ida_alloc_range(ida, min, ~0, gfp); +} + +/** + * ida_alloc_max() - Allocate an unused ID. + * @ida: IDA handle. + * @max: Highest ID to allocate. + * @gfp: Memory allocation flags. + * + * Allocate an ID between 0 and @max, inclusive. + * + * Context: Any context. + * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, + * or %-ENOSPC if there are no free IDs. + */ +static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) +{ + return ida_alloc_range(ida, 0, max, gfp); +} static inline void ida_init(struct ida *ida) { INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT); } +#define ida_simple_get(ida, start, end, gfp) \ + ida_alloc_range(ida, start, (end) - 1, gfp) +#define ida_simple_remove(ida, id) ida_free(ida, id) + /** * ida_get_new - allocate new ID * @ida: idr handle -- cgit v1.2.3 From b03f8e43c9261878bf29d8cc1c3ba458cc98287e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 18 Jun 2018 19:02:48 -0400 Subject: ida: Remove old API Delete ida_pre_get(), ida_get_new(), ida_get_new_above() and ida_remove() from the public API. Some of these functions still exist as internal helpers, but they should not be called by consumers. Signed-off-by: Matthew Wilcox --- include/linux/idr.h | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index cd339da0b1aa..2e1db0e36486 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -225,13 +225,9 @@ struct ida { } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) -int ida_pre_get(struct ida *ida, gfp_t gfp_mask); -int ida_get_new_above(struct ida *ida, int starting_id, int *p_id); -void ida_remove(struct ida *ida, int id); -void ida_destroy(struct ida *ida); - int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); +void ida_destroy(struct ida *ida); /** * ida_alloc() - Allocate an unused ID. @@ -292,20 +288,11 @@ static inline void ida_init(struct ida *ida) ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) -/** - * ida_get_new - allocate new ID - * @ida: idr handle - * @p_id: pointer to the allocated handle - * - * Simple wrapper around ida_get_new_above() w/ @starting_id of zero. - */ -static inline int ida_get_new(struct ida *ida, int *p_id) -{ - return ida_get_new_above(ida, 0, p_id); -} - static inline bool ida_is_empty(const struct ida *ida) { return radix_tree_empty(&ida->ida_rt); } + +/* in lib/radix-tree.c */ +int ida_pre_get(struct ida *ida, gfp_t gfp_mask); #endif /* __IDR_H__ */ -- cgit v1.2.3 From fd991a23c8f6ef30692f77409602ccf3614353b2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 22:33:00 +0200 Subject: y2038: Provide aliases for compat helpers As part of the system call rework for 64-bit time_t, we are restructuring the way that compat syscalls deal with 32-bit time_t, reusing the implementation for 32-bit architectures. Christoph Hellwig suggested a rename of the associated types and interfaces to avoid the confusing usage of the 'compat' prefix for 32-bit architectures. To prepare for doing that in linux-4.20, add a set of macros that allows to convert subsystems separately to the new names and avoids some of the nastier merge conflicts. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Acked-by: Christoph Hellwig Cc: y2038@lists.linaro.org Cc: John Stultz Cc: Deepa Dinamani Link: https://lkml.kernel.org/r/20180821203329.2089473-1-arnd@arndb.de --- include/linux/time32.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time32.h b/include/linux/time32.h index 0b14f936100a..d1ae43c13e25 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -207,4 +207,19 @@ static inline s64 timeval_to_ns(const struct timeval *tv) extern struct timeval ns_to_timeval(const s64 nsec); extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); +/* + * New aliases for compat time functions. These will be used to replace + * the compat code so it can be shared between 32-bit and 64-bit builds + * both of which provide compatibility with old 32-bit tasks. + */ +#define old_time32_t compat_time_t +#define old_timeval32 compat_timeval +#define old_timespec32 compat_timespec +#define old_itimerspec32 compat_itimerspec +#define ns_to_old_timeval32 ns_to_compat_timeval +#define get_old_itimerspec32 get_compat_itimerspec64 +#define put_old_itimerspec32 put_compat_itimerspec64 +#define get_old_timespec32 compat_get_timespec64 +#define put_old_timespec32 compat_put_timespec64 + #endif -- cgit v1.2.3 From c4df32c80d04987023844c1fb13734a872c8f2e2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 Aug 2018 07:34:47 +0900 Subject: export.h: remove VMLINUX_SYMBOL() and VMLINUX_SYMBOL_STR() With the special case handling for Blackfin and Metag was removed by commit 94e58e0ac312 ("export.h: remove code for prefixing symbols with underscore"), VMLINUX_SYMBOL() is no-op. Replace the remaining usages, then remove the definition of VMLINUX_SYMBOL() and VMLINUX_SYMBOL_STR(). Signed-off-by: Masahiro Yamada --- include/linux/export.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index b768d6dd3c90..c363bde21bbe 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -10,13 +10,6 @@ * hackers place grumpy comments in header files. */ -#define __VMLINUX_SYMBOL(x) x -#define __VMLINUX_SYMBOL_STR(x) #x - -/* Indirect, so macros are expanded before pasting. */ -#define VMLINUX_SYMBOL(x) __VMLINUX_SYMBOL(x) -#define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) - #ifndef __ASSEMBLY__ struct kernel_symbol { -- cgit v1.2.3 From 16af2d65842d343c2f95733c3993a0b5baab08f9 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 22 Aug 2018 21:13:01 +0900 Subject: ata: add an extra argument to ahci_platform_get_resources() Add an extra argument to ahci_platform_get_resources(), that is for the bitmap representing the resource to get in this function. Currently there is no resources to be defined, so all the callers set '0' to the argument. Suggested-by: Hans de Goede Cc: Thierry Reding Cc: Matthias Brugger Cc: Patrice Chotard Cc: Maxime Ripard Signed-off-by: Kunihiko Hayashi Reviewed-by: Hans de Goede Signed-off-by: Tejun Heo --- include/linux/ahci_platform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 1b0a17b22cd3..6490be1f8a16 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -30,7 +30,7 @@ void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); void ahci_platform_disable_resources(struct ahci_host_priv *hpriv); struct ahci_host_priv *ahci_platform_get_resources( - struct platform_device *pdev); + struct platform_device *pdev, unsigned int flags); int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, const struct ata_port_info *pi_template, -- cgit v1.2.3 From 9d2ab99573970838108add835442a03e23f8577b Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 22 Aug 2018 21:13:02 +0900 Subject: ata: libahci_platform: add reset control support Add support to get and control a list of resets for the device as optional and shared. These resets must be kept de-asserted until the device is enabled. This is specified as shared because some SoCs like UniPhier series have common reset controls with all ahci controller instances. However, according to Thierry's view, https://www.spinics.net/lists/linux-ide/msg55357.html some hardware-specific drivers already use their own resets, and the common reset make a path to occur double controls of resets. The ahci_platform_get_resources() can get and control the reset only when the second argument includes AHCI_PLATFORM_GET_RESETS bit. Suggested-by: Hans de Goede Cc: Thierry Reding Signed-off-by: Kunihiko Hayashi Reviewed-by: Hans de Goede Signed-off-by: Tejun Heo --- include/linux/ahci_platform.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 6490be1f8a16..eaedca5fe6fc 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -43,4 +43,6 @@ int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); int ahci_platform_resume(struct device *dev); +#define AHCI_PLATFORM_GET_RESETS 0x01 + #endif /* _AHCI_PLATFORM_H */ -- cgit v1.2.3 From 64d9d13828c6c8e188bba63794eee923df3d69a9 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Tue, 31 Jul 2018 01:37:30 +0000 Subject: fs/quota: Replace XQM_MAXQUOTAS usage with MAXQUOTAS XQM_MAXQUOTAS and MAXQUOTAS are, it appears, equivalent. Replace all usage of XQM_MAXQUOTAS and remove it along with the unused XQM_*QUOTA definitions. Signed-off-by: Jeremy Cline Signed-off-by: Jan Kara --- include/linux/quota.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quota.h b/include/linux/quota.h index ca9772c8e48b..f32dd270b8e3 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -408,13 +408,7 @@ struct qc_type_state { struct qc_state { unsigned int s_incoredqs; /* Number of dquots in core */ - /* - * Per quota type information. The array should really have - * max(MAXQUOTAS, XQM_MAXQUOTAS) entries. BUILD_BUG_ON in - * quota_getinfo() makes sure XQM_MAXQUOTAS is large enough. Once VFS - * supports project quotas, this can be changed to MAXQUOTAS - */ - struct qc_type_state s_state[XQM_MAXQUOTAS]; + struct qc_type_state s_state[MAXQUOTAS]; /* Per quota type information */ }; /* Structure for communicating via ->set_info */ -- cgit v1.2.3 From 92be775a3db343889ab159c44d55315c5ee0be4e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Aug 2018 21:51:53 -0700 Subject: mm: struct shrink_control: keep int fields together Patch series "Reorderings in struct shrinker and struct shrink_control". These structures are intensively used during reclaim and, displace other data in cache, so there is no a reason they have int fields not grouped together. This patch (of 2): gfp_t is of unsigned type, so let's move nid to keep them together. Link: http://lkml.kernel.org/r/153199747930.21131.861043607301997810.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Chris Wilson Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index b154fd2b084c..d58aaaed34a4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -12,6 +12,9 @@ struct shrink_control { gfp_t gfp_mask; + /* current node being shrunk (for NUMA aware shrinkers) */ + int nid; + /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees @@ -26,9 +29,6 @@ struct shrink_control { */ unsigned long nr_scanned; - /* current node being shrunk (for NUMA aware shrinkers) */ - int nid; - /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; -- cgit v1.2.3 From e50ef89b0f58a2cb0e7d496a97b851e0dced62a6 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Aug 2018 21:51:57 -0700 Subject: mm: struct shrinker: make flags of unsigned type Currently, there are two flags only, so unsigned is more then enough. Also, move int seeks to keep these fields together. Link: http://lkml.kernel.org/r/153199748720.21131.6476256940113102483.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Chris Wilson Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index d58aaaed34a4..9443cafd1969 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -63,9 +63,9 @@ struct shrinker { unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ - unsigned long flags; + int seeks; /* seeks to recreate an obj */ + unsigned flags; /* These are for internal use */ struct list_head list; -- cgit v1.2.3 From 5d5e8f19544a35ae1609bd96ae6b28c9fcd1baf6 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:20 -0700 Subject: mm, swap, get_swap_pages: use entry_size instead of cluster in parameter As suggested by Matthew Wilcox, it is better to use "int entry_size" instead of "bool cluster" as parameter to specify whether to operate for huge or normal swap entries. Because this improve the flexibility to support other swap entry size. And Dave Hansen thinks that this improves code readability too. So in this patch, the "bool cluster" parameter of get_swap_pages() is replaced by "int entry_size". And nr_swap_entries() trick is used to reduce the binary size when !CONFIG_TRANSPARENT_HUGE_PAGE. text data bss dec hex filename base 24215 2028 340 26583 67d7 mm/swapfile.o head 24123 2004 340 26467 6763 mm/swapfile.o Link: http://lkml.kernel.org/r/20180720071845.17920-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Matthew Wilcox Acked-by: Dave Hansen Cc: Daniel Jordan Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1a8bd05a335e..8e2c11e692ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,7 +447,7 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); -- cgit v1.2.3 From 93065ac753e4443840a057bfef4be71ec766fde9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2018 21:52:33 -0700 Subject: mm, oom: distinguish blockable mode for mmu notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several blockable mmu notifiers which might sleep in mmu_notifier_invalidate_range_start and that is a problem for the oom_reaper because it needs to guarantee a forward progress so it cannot depend on any sleepable locks. Currently we simply back off and mark an oom victim with blockable mmu notifiers as done after a short sleep. That can result in selecting a new oom victim prematurely because the previous one still hasn't torn its memory down yet. We can do much better though. Even if mmu notifiers use sleepable locks there is no reason to automatically assume those locks are held. Moreover majority of notifiers only care about a portion of the address space and there is absolutely zero reason to fail when we are unmapping an unrelated range. Many notifiers do really block and wait for HW which is harder to handle and we have to bail out though. This patch handles the low hanging fruit. __mmu_notifier_invalidate_range_start gets a blockable flag and callbacks are not allowed to sleep if the flag is set to false. This is achieved by using trylock instead of the sleepable lock for most callbacks and continue as long as we do not block down the call chain. I think we can improve that even further because there is a common pattern to do a range lookup first and then do something about that. The first part can be done without a sleeping lock in most cases AFAICS. The oom_reaper end then simply retries if there is at least one notifier which couldn't make any progress in !blockable mode. A retry loop is already implemented to wait for the mmap_sem and this is basically the same thing. The simplest way for driver developers to test this code path is to wrap userspace code which uses these notifiers into a memcg and set the hard limit to hit the oom. This can be done e.g. after the test faults in all the mmu notifier managed memory and set the hard limit to something really small. Then we are looking for a proper process tear down. [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: minor code simplification] Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Christian König # AMD notifiers Acked-by: Leon Romanovsky # mlx and umem_odp Reported-by: David Rientjes Cc: "David (ChunMing) Zhou" Cc: Paolo Bonzini Cc: Alex Deucher Cc: David Airlie Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Doug Ledford Cc: Jason Gunthorpe Cc: Mike Marciniszyn Cc: Dennis Dalessandro Cc: Sudeep Dutt Cc: Ashutosh Dixit Cc: Dimitri Sivanich Cc: Boris Ostrovsky Cc: Juergen Gross Cc: "Jérôme Glisse" Cc: Andrea Arcangeli Cc: Felix Kuehling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kvm_host.h | 4 ++-- include/linux/mmu_notifier.h | 33 +++++++++++++++++++++++++-------- include/linux/oom.h | 2 +- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c7362dd2faa..0205aee44ded 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 392e6af82701..133ba78820ee 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -151,13 +151,15 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. * - * If both of these callbacks cannot block, and invalidate_range - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. + * If blockable argument is set to false then the callback cannot + * sleep and has to return with -EAGAIN. 0 should be returned + * otherwise. + * */ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool blockable); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end, bool only_end); @@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + __mmu_notifier_invalidate_range_start(mm, start, end, true); +} + +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_invalidate_range_start(mm, start, end, false); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, { } +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + return 0; +} + static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 6adac113e96d..92f70e4c6252 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm) return 0; } -void __oom_reap_task_mm(struct mm_struct *mm); +bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, -- cgit v1.2.3 From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Aug 2018 21:53:06 -0700 Subject: mm: zero out the vma in vma_init() Rather than in vm_area_alloc(). To ensure that the various oddball stack-based vmas are in a good state. Some of the callers were zeroing them out, others were not. Acked-by: Kirill A. Shutemov Cc: Russell King Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..3a4b87d1a59a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); -- cgit v1.2.3 From 89696701ea847786f88ccc1c580fb4c3c20c4a3a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:24 -0700 Subject: mm: remove zone_id() and make use of zone_idx() in is_dev_zone() is_dev_zone() is using zone_id() to check if the zone is ZONE_DEVICE. zone_id() looks pretty much the same as zone_idx(), and while the use of zone_idx() is quite spread in the kernel, zone_id() is only being used by is_dev_zone(). This patch removes zone_id() and makes is_dev_zone() use zone_idx() to check the zone, so we do not have two things with the same functionality around. Link: http://lkml.kernel.org/r/20180730133718.28683-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Pavel Tatashin Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 32699b2dc52a..3d4577a3ae7e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -755,25 +755,6 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } -static inline int zone_id(const struct zone *zone) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - - return zone - pgdat->node_zones; -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_dev_zone(const struct zone *zone) -{ - return zone_id(zone) == ZONE_DEVICE; -} -#else -static inline bool is_dev_zone(const struct zone *zone) -{ - return false; -} -#endif - #include void build_all_zonelists(pg_data_t *pgdat); @@ -824,6 +805,18 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_dev_zone(const struct zone *zone) +{ + return zone_idx(zone) == ZONE_DEVICE; +} +#else +static inline bool is_dev_zone(const struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than -- cgit v1.2.3 From c1093b746c0576ed81c4d568d1e39cab651d37e6 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 21 Aug 2018 21:53:32 -0700 Subject: mm: access zone->node via zone_to_nid() and zone_set_nid() zone->node is configured only when CONFIG_NUMA=y, so it is a good idea to have inline functions to access this field in order to avoid ifdef's in c files. Link: http://lkml.kernel.org/r/20180730101757.28058-3-osalvador@techadventures.net Signed-off-by: Pavel Tatashin Signed-off-by: Oscar Salvador Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 --------- include/linux/mmzone.h | 26 ++++++++++++++++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a4b87d1a59a..a55f5389c491 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -960,15 +960,6 @@ static inline int page_zone_id(struct page *page) return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline int zone_to_nid(struct zone *zone) -{ -#ifdef CONFIG_NUMA - return zone->node; -#else - return 0; -#endif -} - #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3d4577a3ae7e..1e22d96734e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -834,6 +834,25 @@ static inline bool populated_zone(struct zone *zone) return zone->present_pages; } +#ifdef CONFIG_NUMA +static inline int zone_to_nid(struct zone *zone) +{ + return zone->node; +} + +static inline void zone_set_nid(struct zone *zone, int nid) +{ + zone->node = nid; +} +#else +static inline int zone_to_nid(struct zone *zone) +{ + return 0; +} + +static inline void zone_set_nid(struct zone *zone, int nid) {} +#endif + extern int movable_zone; #ifdef CONFIG_HIGHMEM @@ -949,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref) static inline int zonelist_node_idx(struct zoneref *zoneref) { -#ifdef CONFIG_NUMA - /* zone_to_nid not available in this context */ - return zoneref->zone->node; -#else - return 0; -#endif /* CONFIG_NUMA */ + return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, -- cgit v1.2.3 From 03e85f9d5f1f8c74f127c5f7a87575d74a78d248 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:43 -0700 Subject: mm/page_alloc: Introduce free_area_init_core_hotplug Currently, whenever a new node is created/re-used from the memhotplug path, we call free_area_init_node()->free_area_init_core(). But there is some code that we do not really need to run when we are coming from such path. free_area_init_core() performs the following actions: 1) Initializes pgdat internals, such as spinlock, waitqueues and more. 2) Account # nr_all_pages and # nr_kernel_pages. These values are used later on when creating hash tables. 3) Account number of managed_pages per zone, substracting dma_reserved and memmap pages. 4) Initializes some fields of the zone structure data 5) Calls init_currently_empty_zone to initialize all the freelists 6) Calls memmap_init to initialize all pages belonging to certain zone When called from memhotplug path, free_area_init_core() only performs actions #1 and #4. Action #2 is pointless as the zones do not have any pages since either the node was freed, or we are re-using it, eitherway all zones belonging to this node should have 0 pages. For the same reason, action #3 results always in manages_pages being 0. Action #5 and #6 are performed later on when onlining the pages: online_pages()->move_pfn_range_to_zone()->init_currently_empty_zone() online_pages()->move_pfn_range_to_zone()->memmap_init_zone() This patch does two things: First, moves the node/zone initializtion to their own function, so it allows us to create a small version of free_area_init_core, where we only perform: 1) Initialization of pgdat internals, such as spinlock, waitqueues and more 4) Initialization of some fields of the zone structure data These two functions are: pgdat_init_internals() and zone_init_internals(). The second thing this patch does, is to introduce free_area_init_core_hotplug(), the memhotplug version of free_area_init_core(): Currently, we call free_area_init_node() from the memhotplug path. In there, we set some pgdat's fields, and call calculate_node_totalpages(). calculate_node_totalpages() calculates the # of pages the node has. Since the node is either new, or we are re-using it, the zones belonging to this node should not have any pages, so there is no point to calculate this now. Actually, we re-set these values to 0 later on with the calls to: reset_node_managed_pages() reset_node_present_pages() The # of pages per node and the # of pages per zone will be calculated when onlining the pages: online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_zone_range() online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_pgdat_range() Also, since free_area_init_core/free_area_init_node will now only get called during early init, let us replace __paginginit with __init, so their code gets freed up. [osalvador@techadventures.net: fix section usage] Link: http://lkml.kernel.org/r/20180731101752.GA473@techadventures.net [osalvador@suse.de: v6] Link: http://lkml.kernel.org/r/20180801122348.21588-6-osalvador@techadventures.net Link: http://lkml.kernel.org/r/20180730101757.28058-5-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Pasha Tatashin Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + include/linux/mm.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4e9828cda7a2..34a28227068d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) static inline void remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void __ref free_area_init_core_hotplug(int nid); extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mm.h b/include/linux/mm.h index a55f5389c491..a9e733b5fb76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2015,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, unsigned long * zones_size, +extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); -- cgit v1.2.3 From 3d8b38eb81cac81395f6a823f6bf401b327268e6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 21 Aug 2018 21:53:54 -0700 Subject: mm, oom: introduce memory.oom.group For some workloads an intervention from the OOM killer can be painful. Killing a random task can bring the workload into an inconsistent state. Historically, there are two common solutions for this problem: 1) enabling panic_on_oom, 2) using a userspace daemon to monitor OOMs and kill all outstanding processes. Both approaches have their downsides: rebooting on each OOM is an obvious waste of capacity, and handling all in userspace is tricky and requires a userspace agent, which will monitor all cgroups for OOMs. In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate the necessity of enabling panic_on_oom. Also, it can simplify the cgroup management for userspace applications. This commit introduces a new knob for cgroup v2 memory controller: memory.oom.group. The knob determines whether the cgroup should be treated as an indivisible workload by the OOM killer. If set, all tasks belonging to the cgroup or to its descendants (if the memory cgroup is not a leaf cgroup) are killed together or not at all. To determine which cgroup has to be killed, we do traverse the cgroup hierarchy from the victim task's cgroup up to the OOMing cgroup (or root) and looking for the highest-level cgroup with memory.oom.group set. Tasks with the OOM protection (oom_score_adj set to -1000) are treated as an exception and are never killed. This patch doesn't change the OOM victim selection algorithm. Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: David Rientjes Cc: Tetsuo Handa Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e6c515fb698..652f602167df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -225,6 +225,11 @@ struct mem_cgroup { */ bool use_hierarchy; + /* + * Should the OOM killer kill all belonging tasks, had it kill one? + */ + bool oom_group; + /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; @@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p) } bool mem_cgroup_oom_synchronize(bool wait); +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain); +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline struct mem_cgroup *mem_cgroup_get_oom_group( + struct task_struct *victim, struct mem_cgroup *oom_domain) +{ + return NULL; +} + +static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { -- cgit v1.2.3 From 7e8a6304d5419cbf056a59de92939e5eef039c57 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 21 Aug 2018 21:53:58 -0700 Subject: /proc/meminfo: add percpu populated pages count Currently, percpu memory only exposes allocation and utilization information via debugfs. This more or less is only really useful for understanding the fragmentation and allocation information at a per-chunk level with a few global counters. This is also gated behind a config. BPF and cgroup, for example, have seen an increase in use causing increased use of percpu memory. Let's make it easier for someone to identify how much memory is being used. This patch adds the "Percpu" stat to meminfo to more easily look up how much percpu memory is in use. This number includes the cost for all allocated backing pages and not just insight at the per a unit, per chunk level. Metadata is excluded. I think excluding metadata is fair because the backing memory scales with the numbere of cpus and can quickly outweigh the metadata. It also makes this calculation light. Link: http://lkml.kernel.org/r/20180807184723.74919-1-dennisszhou@gmail.com Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Acked-by: Roman Gushchin Reviewed-by: Andrew Morton Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Christoph Lameter Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 296bbe49d5d1..70b7123f38c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern unsigned long pcpu_nr_pages(void); + #endif /* __LINUX_PERCPU_H */ -- cgit v1.2.3 From 5df66d306ec9b1952d3d183fe4e2ced1a7b2bddb Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:54:06 -0700 Subject: mm: fix comment for NODEMASK_ALLOC Currently, NODEMASK_ALLOC allocates a nodemask_t with kmalloc when NODES_SHIFT is higher than 8, otherwise it declares it within the stack. The comment says that the reasoning behind this, is that nodemask_t will be 256 bytes when NODES_SHIFT is higher than 8, but this is not true. For example, NODES_SHIFT = 9 will give us a 64 bytes nodemask_t. Let us fix up the comment for that. Another thing is that it might make sense to let values lower than 128bytes be allocated in the stack. Although this all depends on the depth of the stack (and this changes from function to function), I think that 64 bytes is something we can easily afford. So we could even bump the limit by 1 (from > 8 to > 9). Link: http://lkml.kernel.org/r/20180820085516.9687-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 1fbde8a880d9..5a30ad594ccc 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -518,7 +518,7 @@ static inline int node_random(const nodemask_t *mask) * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ -#if NODES_SHIFT > 8 /* nodemask_t > 256 bytes */ +#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) -- cgit v1.2.3 From 891ae71dc4fbd2454a3fa569e115a7ca86630949 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:37 -0700 Subject: proc: spread "const" a bit Link: http://lkml.kernel.org/r/20180627200614.GB18434@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 626fc65c4336..d0e1f1522a78 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) { return inode->i_sb->s_fs_info; } -- cgit v1.2.3 From a8dd9c4df18edc873d244790d163564a5d17626b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:54:51 -0700 Subject: proc/kcore: don't grab lock for kclist_add() Patch series "/proc/kcore improvements", v4. This series makes a few improvements to /proc/kcore. It fixes a couple of small issues in v3 but is otherwise the same. Patches 1, 2, and 3 are prep patches. Patch 4 is a fix/cleanup. Patch 5 is another prep patch. Patches 6 and 7 are optimizations to ->read(). Patch 8 makes it possible to enable CRASH_CORE on any architecture, which is needed for patch 9. Patch 9 adds vmcoreinfo to /proc/kcore. This patch (of 9): kclist_add() is only called at init time, so there's no point in grabbing any locks. We're also going to replace the rwlock with a rwsem, which we don't want to try grabbing during early boot. While we're here, mark kclist_add() with __init so that we'll get a warning if it's called from non-init code. Link: http://lkml.kernel.org/r/98208db1faf167aa8b08eebfa968d95c70527739.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Reviewed-by: Bhupesh Sharma Tested-by: Bhupesh Sharma Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kcore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8de55e4b5ee9..c20f296438fb 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -35,7 +35,7 @@ struct vmcoredd_node { }; #ifdef CONFIG_PROC_KCORE -extern void kclist_add(struct kcore_list *, void *, size_t, int type); +void __init kclist_add(struct kcore_list *, void *, size_t, int type); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) -- cgit v1.2.3 From 23c85094fe1895caefdd19ef624ee687ec5f4507 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:20 -0700 Subject: proc/kcore: add vmcoreinfo note to /proc/kcore The vmcoreinfo information is useful for runtime debugging tools, not just for crash dumps. A lot of this information can be determined by other means, but this is much more convenient, and it only adds a page at most to the file. Link: http://lkml.kernel.org/r/fddbcd08eed76344863303878b12de1c1e2a04b6.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..525510a9f965 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, -- cgit v1.2.3 From cedc5b6aab493f6b1b1d381dccc0cc082da7d3d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 21 Aug 2018 21:55:28 -0700 Subject: kernel.h: documentation for roundup() vs round_up() Things like 3619dec5103d ("dh key: fix rounding up KDF output length") expose the lack of explicit documentation for roundup() vs round_up(). At least we can try to document it better if anyone goes looking. Link: http://lkml.kernel.org/r/20180703041950.GA43464@beast Signed-off-by: Kees Cook Cc: Ingo Molnar Cc: Greg Kroah-Hartman Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 941dc0a5a877..d6aac75b51ba 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,7 +85,23 @@ * arguments just once each. */ #define __round_mask(x, y) ((__typeof__(x))((y)-1)) +/** + * round_up - round up to next specified power of 2 + * @x: the value to round + * @y: multiple to round up to (must be a power of 2) + * + * Rounds @x up to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding up, use roundup() below. + */ #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +/** + * round_down - round down to next specified power of 2 + * @x: the value to round + * @y: multiple to round down to (must be a power of 2) + * + * Rounds @x down to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding down, use rounddown() below. + */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) /** @@ -110,13 +126,30 @@ # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d) #endif -/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */ +/** + * roundup - round up to the next specified multiple + * @x: the value to up + * @y: multiple to round up to + * + * Rounds @x up to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_up(). + * + * The `const' here prevents gcc-3.3 from calling __divdi3 + */ #define roundup(x, y) ( \ { \ const typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ } \ ) +/** + * rounddown - round down to next specified multiple + * @x: the value to round + * @y: multiple to round down to + * + * Rounds @x down to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_down(). + */ #define rounddown(x, y) ( \ { \ typeof(x) __x = (x); \ -- cgit v1.2.3 From e58dd0de5eadf145895b13451a1fef8ef03946eb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:31 -0700 Subject: bdi: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This permits avoiding accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/20180703200141.28415-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Suggested-by: Peter Zijlstra Cc: Jens Axboe Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 ++- include/linux/backing-dev.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 24251762c20c..9a6bc0951cfa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -12,6 +12,7 @@ #include #include #include +#include struct page; struct device; @@ -75,7 +76,7 @@ enum wb_reason { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ - atomic_t refcnt; /* nr of attached wb's and blkg */ + refcount_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *__bdi; /* the associated bdi, set to NULL diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 72ca0f3d39f3..c28a47cbe355 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - atomic_inc(&bdi->wb_congested->refcnt); + refcount_inc(&bdi->wb_congested->refcnt); return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { - if (atomic_dec_and_test(&congested->refcnt)) + if (refcount_dec_and_test(&congested->refcnt)) kfree(congested); } -- cgit v1.2.3 From fc37191272a972d449bab3d8247cf52cadf5d4e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:38 -0700 Subject: userns: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding API should be used instead of atomic_t wh en the variable is used as a reference counter. This avoids accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/20180703200141.28415-6-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Reviewed-by: Andrew Morton Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/user.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 96fe289c4c6e..39ad98c09c58 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include struct key; @@ -12,7 +13,7 @@ struct key; * Some day this will be a full-fledged user tracking system.. */ struct user_struct { - atomic_t __count; /* reference count */ + refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY @@ -59,7 +60,7 @@ extern struct user_struct root_user; extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) { - atomic_inc(&u->__count); + refcount_inc(&u->__count); return u; } extern void free_uid(struct user_struct *); -- cgit v1.2.3 From 203583990cb675aeddff6d7623f68268068c078c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 21 Aug 2018 21:55:45 -0700 Subject: linux/compiler.h: don't use bool Appararently, it's possible to have a non-trivial TU include a few headers, including linux/build_bug.h, without ending up with linux/types.h. So the 0day bot sent me config: um-x86_64_defconfig (attached as .config) >> include/linux/compiler.h:316:3: error: unknown type name 'bool'; did you mean '_Bool'? bool __cond = !(condition); \ for something I'm working on. Rather than contributing to the #include madness and including linux/types.h from compiler.h, just use int. Link: http://lkml.kernel.org/r/20180817101036.20969-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Christopher Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 42506e4d1f53..c8eab637a2a7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -313,7 +313,7 @@ unsigned long read_word_at_a_time(const void *addr) #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ - bool __cond = !(condition); \ + int __cond = !(condition); \ extern void prefix ## suffix(void) __compiletime_error(msg); \ if (__cond) \ prefix ## suffix(); \ -- cgit v1.2.3 From a2e514453861dd39b53b7a50b6771bd3f9852078 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 21 Aug 2018 21:55:52 -0700 Subject: kernel/hung_task.c: allow to set checking interval separately from timeout Currently task hung checking interval is equal to timeout, as the result hung is detected anywhere between timeout and 2*timeout. This is fine for most interactive environments, but this hurts automated testing setups (syzbot). In an automated setup we need to strictly order CPU lockup < RCU stall < workqueue lockup < task hung < silent loss, so that RCU stall is not detected as task hung and task hung is not detected as silent machine loss. The large variance in task hung detection timeout requires setting silent machine loss timeout to a very large value (e.g. if task hung is 3 mins, then silent loss need to be set to ~7 mins). The additional 3 minutes significantly reduce testing efficiency because usually we crash kernel within a minute, and this can add hours to bug localization process as it needs to do dozens of tests. Allow setting checking interval separately from timeout. This allows to set timeout to, say, 3 minutes, but checking interval to 10 secs. The interval is controlled via a new hung_task_check_interval_secs sysctl, similar to the existing hung_task_timeout_secs sysctl. The default value of 0 results in the current behavior: checking interval is equal to timeout. [akpm@linux-foundation.org: update hung_task_timeout_max's comment] Link: http://lkml.kernel.org/r/20180611111004.203513-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Paul E. McKenney Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + include/linux/sched/sysctl.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 789923fbee3a..58eb3a2bc695 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -853,6 +853,7 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; + unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 913488d828cb..a9c32daeb9d8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -10,6 +10,7 @@ struct ctl_table; extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, -- cgit v1.2.3 From f922c4abdf7648523589abee9460c87f51630d2f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:04 -0700 Subject: module: allow symbol exports to be disabled To allow existing C code to be incorporated into the decompressor or the UEFI stub, introduce a CPP macro that turns all EXPORT_SYMBOL_xxx declarations into nops, and #define it in places where such exports are undesirable. Note that this gets rid of a rather dodgy redefine of linux/export.h's header guard. Link: http://lkml.kernel.org/r/20180704083651.24360-3-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Nicolas Pitre Acked-by: Michael Ellerman Reviewed-by: Will Deacon Acked-by: Ingo Molnar Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/export.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index b768d6dd3c90..ea7df303d68d 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -66,7 +66,16 @@ extern struct module __this_module; __attribute__((section("___ksymtab" sec "+" #sym), used)) \ = { (unsigned long)&sym, __kstrtab_##sym } -#if defined(__KSYM_DEPS__) +#if defined(__DISABLE_EXPORTS) + +/* + * Allow symbol exports to be disabled completely so that C code may + * be reused in other execution contexts such as the UEFI stub or the + * decompressor. + */ +#define __EXPORT_SYMBOL(sym, sec) + +#elif defined(__KSYM_DEPS__) /* * For fine grained build dependencies, we want to tell the build system -- cgit v1.2.3 From 7290d58095712a89f845e1bca05334796dd49ed2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:09 -0700 Subject: module: use relative references for __ksymtab entries An ordinary arm64 defconfig build has ~64 KB worth of __ksymtab entries, each consisting of two 64-bit fields containing absolute references, to the symbol itself and to a char array containing its name, respectively. When we build the same configuration with KASLR enabled, we end up with an additional ~192 KB of relocations in the .init section, i.e., one 24 byte entry for each absolute reference, which all need to be processed at boot time. Given how the struct kernel_symbol that describes each entry is completely local to module.c (except for the references emitted by EXPORT_SYMBOL() itself), we can easily modify it to contain two 32-bit relative references instead. This reduces the size of the __ksymtab section by 50% for all 64-bit architectures, and gets rid of the runtime relocations entirely for architectures implementing KASLR, either via standard PIE linking (arm64) or using custom host tools (x86). Note that the binary search involving __ksymtab contents relies on each section being sorted by symbol name. This is implemented based on the input section names, not the names in the ksymtab entries, so this patch does not interfere with that. Given that the use of place-relative relocations requires support both in the toolchain and in the module loader, we cannot enable this feature for all architectures. So make it dependent on whether CONFIG_HAVE_ARCH_PREL32_RELOCATIONS is defined. Link: http://lkml.kernel.org/r/20180704083651.24360-4-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Jessica Yu Acked-by: Michael Ellerman Reviewed-by: Will Deacon Acked-by: Ingo Molnar Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 19 +++++++++++++++++++ include/linux/export.h | 46 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c8eab637a2a7..681d866efb1e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr) #endif /* __KERNEL__ */ +/* + * Force the compiler to emit 'sym' as a symbol, so that we can reference + * it from inline assembler. Necessary in case 'sym' could be inlined + * otherwise, or eliminated entirely due to lack of references that are + * visible to the compiler. + */ +#define __ADDRESSABLE(sym) \ + static void * __attribute__((section(".discard.addressable"), used)) \ + __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; + +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + #endif /* __ASSEMBLY__ */ #ifndef __optimize diff --git a/include/linux/export.h b/include/linux/export.h index ea7df303d68d..ae072bc5aacf 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -18,12 +18,6 @@ #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) #ifndef __ASSEMBLY__ -struct kernel_symbol -{ - unsigned long value; - const char *name; -}; - #ifdef MODULE extern struct module __this_module; #define THIS_MODULE (&__this_module) @@ -54,17 +48,47 @@ extern struct module __this_module; #define __CRC_SYMBOL(sym, sec) #endif +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#include +/* + * Emit the ksymtab entry as a pair of relative references: this reduces + * the size by half on 64-bit architectures, and eliminates the need for + * absolute relocations that require runtime processing on relocatable + * kernels. + */ +#define __KSYMTAB_ENTRY(sym, sec) \ + __ADDRESSABLE(sym) \ + asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ + " .balign 8 \n" \ + "__ksymtab_" #sym ": \n" \ + " .long " #sym "- . \n" \ + " .long __kstrtab_" #sym "- . \n" \ + " .previous \n") + +struct kernel_symbol { + int value_offset; + int name_offset; +}; +#else +#define __KSYMTAB_ENTRY(sym, sec) \ + static const struct kernel_symbol __ksymtab_##sym \ + __attribute__((section("___ksymtab" sec "+" #sym), used)) \ + = { (unsigned long)&sym, __kstrtab_##sym } + +struct kernel_symbol { + unsigned long value; + const char *name; +}; +#endif + /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL(sym, sec) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ - __attribute__((section("__ksymtab_strings"), aligned(1))) \ + __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #sym; \ - static const struct kernel_symbol __ksymtab_##sym \ - __used \ - __attribute__((section("___ksymtab" sec "+" #sym), used)) \ - = { (unsigned long)&sym, __kstrtab_##sym } + __KSYMTAB_ENTRY(sym, sec) #if defined(__DISABLE_EXPORTS) -- cgit v1.2.3 From 1b1eeca7e4c19fa76d409d4c7b338dba21f2df45 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:13 -0700 Subject: init: allow initcall tables to be emitted using relative references Allow the initcall tables to be emitted using relative references that are only half the size on 64-bit architectures and don't require fixups at runtime on relocatable kernels. Link: http://lkml.kernel.org/r/20180704083651.24360-5-ard.biesheuvel@linaro.org Acked-by: James Morris Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Acked-by: Michael Ellerman Acked-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Russell King Cc: "Serge E. Hallyn" Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init.h | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index bc27cf03c41e..2538d176dd1f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -116,8 +116,24 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -extern initcall_t __con_initcall_start[], __con_initcall_end[]; -extern initcall_t __security_initcall_start[], __security_initcall_end[]; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +typedef int initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return offset_to_ptr(entry); +} +#else +typedef initcall_t initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return *entry; +} +#endif + +extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; +extern initcall_entry_t __security_initcall_start[], __security_initcall_end[]; /* Used for contructor calls. */ typedef void (*ctor_fn_t)(void); @@ -167,9 +183,20 @@ extern bool initcall_debug; * as KEEP() in the linker script. */ -#define __define_initcall(fn, id) \ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define ___define_initcall(fn, id, __sec) \ + __ADDRESSABLE(fn) \ + asm(".section \"" #__sec ".init\", \"a\" \n" \ + "__initcall_" #fn #id ": \n" \ + ".long " #fn " - . \n" \ + ".previous \n"); +#else +#define ___define_initcall(fn, id, __sec) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; + __attribute__((__section__(#__sec ".init"))) = fn; +#endif + +#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id) /* * Early initcalls run before initializing SMP. @@ -208,13 +235,8 @@ extern bool initcall_debug; #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.con_initcall.init) = fn - -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.security_initcall.init) = fn +#define console_initcall(fn) ___define_initcall(fn,, .con_initcall) +#define security_initcall(fn) ___define_initcall(fn,, .security_initcall) struct obs_kernel_param { const char *str; -- cgit v1.2.3 From c9d8b55fa0191623fccb9ed67d2ff8f9159e9a89 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:18 -0700 Subject: PCI: Add support for relative addressing in quirk tables Allow the PCI quirk tables to be emitted in a way that avoids absolute references to the hook functions. This reduces the size of the entries, and, more importantly, makes them invariant under runtime relocation (e.g., for KASLR) Link: http://lkml.kernel.org/r/20180704083651.24360-6-ard.biesheuvel@linaro.org Acked-by: Bjorn Helgaas Acked-by: Michael Ellerman Acked-by: Ingo Molnar Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b87f1936906..e72ca8dd6241 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1809,7 +1809,11 @@ struct pci_fixup { u16 device; /* Or PCI_ANY_ID */ u32 class; /* Or PCI_ANY_ID */ unsigned int class_shift; /* should be 0, 8, 16 */ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + int hook_offset; +#else void (*hook)(struct pci_dev *dev); +#endif }; enum pci_fixup_pass { @@ -1823,12 +1827,28 @@ enum pci_fixup_pass { pci_fixup_suspend_late, /* pci_device_suspend_late() */ }; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __ADDRESSABLE(hook) \ + asm(".section " #sec ", \"a\" \n" \ + ".balign 16 \n" \ + ".short " #vendor ", " #device " \n" \ + ".long " #class ", " #class_shift " \n" \ + ".long " #hook " - . \n" \ + ".previous \n"); +#define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) +#else /* Anonymous variables would be nice... */ #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class, \ class_shift, hook) \ static const struct pci_fixup __PASTE(__pci_fixup_##name,__LINE__) __used \ __attribute__((__section__(#section), aligned((sizeof(void *))))) \ = { vendor, device, class, class_shift, hook }; +#endif #define DECLARE_PCI_FIXUP_CLASS_EARLY(vendor, device, class, \ class_shift, hook) \ -- cgit v1.2.3 From 46e0c9be206fa7b11aca75da2d6b8535d0139752 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Aug 2018 21:56:22 -0700 Subject: kernel: tracepoints: add support for relative references To avoid the need for relocating absolute references to tracepoint structures at boot time when running relocatable kernels (which may take a disproportionate amount of space), add the option to emit these tables as relative references instead. Link: http://lkml.kernel.org/r/20180704083651.24360-7-ard.biesheuvel@linaro.org Acked-by: Michael Ellerman Acked-by: Ingo Molnar Acked-by: Steven Rostedt (VMware) Signed-off-by: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: James Morris Cc: James Morris Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Nicolas Pitre Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: "Serge E. Hallyn" Cc: Sergey Senozhatsky Cc: Thomas Garnier Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracepoint.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d9a084c72541..7f2e16e76ac4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -249,6 +249,19 @@ extern void syscall_unregfunc(void); return static_key_false(&__tracepoint_##name.key); \ } +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __TRACEPOINT_ENTRY(name) \ + asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \ + " .balign 4 \n" \ + " .long __tracepoint_" #name " - . \n" \ + " .previous \n") +#else +#define __TRACEPOINT_ENTRY(name) \ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ + &__tracepoint_##name +#endif + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -258,11 +271,9 @@ extern void syscall_unregfunc(void); static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ + __attribute__((section("__tracepoints"), used)) = \ { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ - &__tracepoint_##name; + __TRACEPOINT_ENTRY(name); #define DEFINE_TRACE(name) \ DEFINE_TRACE_FN(name, NULL, NULL); -- cgit v1.2.3 From 9144d75e22cad3c89e6b2ccab551db9ee28d250a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 21 Aug 2018 21:57:03 -0700 Subject: include/linux/bitops.h: introduce BITS_PER_TYPE net_dim.h has a rather useful extension to BITS_PER_BYTE to compute the number of bits in a type (BITS_PER_BYTE * sizeof(T)), so promote the macro to bitops.h, alongside BITS_PER_BYTE, for wider usage. Link: http://lkml.kernel.org/r/20180706094458.14116-1-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Reviewed-by: Jani Nikula Cc: Randy Dunlap Cc: Andy Gospodarek Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 3 ++- include/linux/net_dim.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index af419012d77d..7ddb1349394d 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,7 +4,8 @@ #include #include -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index db99240d00bd..c79e859408e6 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -363,7 +363,6 @@ static inline void net_dim_sample(u16 event_ctr, } #define NET_DIM_NEVENTS 64 -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1)) static inline void net_dim_calc_stats(struct net_dim_sample *start, -- cgit v1.2.3 From feba04fd2cf8f6a74865338df2e3e1e94d6cfd13 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 21 Aug 2018 21:57:11 -0700 Subject: lib: add crc64 calculation routines Patch series "add crc64 calculation as kernel library", v5. This patchset adds basic implementation of crc64 calculation as a Linux kernel library. Since bcache already does crc64 by itself, this patchset also modifies bcache code to use the new crc64 library routine. Currently bcache is the only user of crc64 calculation, another potential user is bcachefs which is on the way to be in mainline kernel. Therefore it makes sense to make crc64 calculation to be a public library. bcache uses crc64 as storage checksum, if a change of crc lib routines results an inconsistent result, the unmatched checksum may make bcache 'think' the on-disk is corrupted, such a change should be avoided or detected as early as possible. Therefore a patch is being prepared which adds a crc test framework, to check consistency of different calculations. This patch (of 2): Add the re-write crc64 calculation routines for Linux kernel. The CRC64 polynomical arithmetic follows ECMA-182 specification, inspired by CRC paper of Dr. Ross N. Williams (see http://www.ross.net/crc/download/crc_v3.txt) and other public domain implementations. All the changes work in this way, - When Linux kernel is built, host program lib/gen_crc64table.c will be compiled to lib/gen_crc64table and executed. - The output of gen_crc64table execution is an array called as lookup table (a.k.a POLY 0x42f0e1eba9ea369) which contain 256 64-bit long numbers, this table is dumped into header file lib/crc64table.h. - Then the header file is included by lib/crc64.c for normal 64bit crc calculation. - Function declaration of the crc64 calculation routines is placed in include/linux/crc64.h Currently bcache is the only user of crc64_be(), another potential user is bcachefs which is on the way to be in mainline kernel. Therefore it makes sense to move crc64 calculation into lib/crc64.c as public code. [colyli@suse.de: fix review comments from v4] Link: http://lkml.kernel.org/r/20180726053352.2781-2-colyli@suse.de Link: http://lkml.kernel.org/r/20180718165545.1622-2-colyli@suse.de Signed-off-by: Coly Li Co-developed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko Reviewed-by: Hannes Reinecke Cc: Greg Kroah-Hartman Cc: Andy Shevchenko Cc: Michael Lyle Cc: Kent Overstreet Cc: Thomas Gleixner Cc: Kate Stewart Cc: Eric Biggers Cc: Randy Dunlap Cc: Noah Massey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crc64.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/crc64.h (limited to 'include/linux') diff --git a/include/linux/crc64.h b/include/linux/crc64.h new file mode 100644 index 000000000000..c756e65a1b58 --- /dev/null +++ b/include/linux/crc64.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * See lib/crc64.c for the related specification and polynomial arithmetic. + */ +#ifndef _LINUX_CRC64_H +#define _LINUX_CRC64_H + +#include + +u64 __pure crc64_be(u64 crc, const void *p, size_t len); +#endif /* _LINUX_CRC64_H */ -- cgit v1.2.3 From 52cba1a274818c3ee6299c1a1b476e7bc5037a2f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 21:59:51 -0700 Subject: signal: make force_sigsegv() void Patch series "signal: refactor some functions", v3. This series refactors a bunch of functions in signal.c to simplify parts of the code. The greatest single change is declaring the static do_sigpending() helper as void which makes it possible to remove a bunch of unnecessary checks in the syscalls later on. This patch (of 17): force_sigsegv() returned 0 unconditionally so it doesn't make sense to have it return at all. In addition, there are no callers that check force_sigsegv()'s return value. Link: http://lkml.kernel.org/r/20180602103653.18181-2-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Al Viro Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 113d1ad1ced7..e138ac16c650 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -314,7 +314,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); +extern void force_sigsegv(int sig, struct task_struct *p); extern int force_sig_info(int, struct siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); -- cgit v1.2.3 From 67a48a24478841525ba73ec6d64caaf079cf3b7c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:34 -0700 Subject: signal: make unhandled_signal() return bool unhandled_signal() already behaves like a boolean function. Let's actually declare it as such too. All callers treat it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-13-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3c5200137b24..fa23cae66758 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -287,7 +287,7 @@ static inline void disallow_signal(int sig) extern struct kmem_cache *sighand_cachep; -int unhandled_signal(struct task_struct *tsk, int sig); +extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) -- cgit v1.2.3 From 20ab7218d2507b2dbec9c053c2f1b96054e266b6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Aug 2018 22:00:54 -0700 Subject: signal: make get_signal() return bool make get_signal() already behaves like a boolean function. Let's actually declare it as such too. Link: http://lkml.kernel.org/r/20180602103653.18181-18-christian@brauner.io Signed-off-by: Christian Brauner Reviewed-by: Andrew Morton Cc: Al Viro Cc: Eric W. Biederman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Morris Cc: Kees Cook Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index fa23cae66758..e3d9e0640e6e 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -265,7 +265,7 @@ extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int get_signal(struct ksignal *ksig); +extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); -- cgit v1.2.3 From dc2c8c84def6ce450c63529e08c1db100020994e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 22:01:52 -0700 Subject: ipc: get rid of ids->tables_initialized hack In sysvipc we have an ids->tables_initialized regarding the rhashtable, introduced in 0cfb6aee70bd ("ipc: optimize semget/shmget/msgget for lots of keys") It's there, specifically, to prevent nil pointer dereferences, from using an uninitialized api. Considering how rhashtable_init() can fail (probably due to ENOMEM, if anything), this made the overall ipc initialization capable of failure as well. That alone is ugly, but fine, however I've spotted a few issues regarding the semantics of tables_initialized (however unlikely they may be): - There is inconsistency in what we return to userspace: ipc_addid() returns ENOSPC which is certainly _wrong_, while ipc_obtain_object_idr() returns EINVAL. - After we started using rhashtables, ipc_findkey() can return nil upon !tables_initialized, but the caller expects nil for when the ipc structure isn't found, and can therefore call into ipcget() callbacks. Now that rhashtable initialization cannot fail, we can properly get rid of the hack altogether. [manfred@colorfullife.com: commit id extended to 12 digits] Link: http://lkml.kernel.org/r/20180712185241.4017-10-manfred@colorfullife.com Signed-off-by: Davidlohr Bueso Signed-off-by: Manfred Spraul Cc: Dmitry Vyukov Cc: Herbert Xu Cc: Kees Cook Cc: Michael Kerrisk Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6cea726612b7..e9ccdfdb1928 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -16,7 +16,6 @@ struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; - bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_id; -- cgit v1.2.3 From 27c331a174614208d0b539019583990967ad9479 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 21 Aug 2018 22:02:00 -0700 Subject: ipc/util.c: further variable name cleanups The varable names got a mess, thus standardize them again: id: user space id. Called semid, shmid, msgid if the type is known. Most functions use "id" already. idx: "index" for the idr lookup Right now, some functions use lid, ipc_addid() already uses idx as the variable name. seq: sequence number, to avoid quick collisions of the user space id key: user space key, used for the rhash tree Link: http://lkml.kernel.org/r/20180712185241.4017-12-manfred@colorfullife.com Signed-off-by: Manfred Spraul Cc: Dmitry Vyukov Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Herbert Xu Cc: Kees Cook Cc: Michael Kerrisk Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e9ccdfdb1928..6ab8c1bada3f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -18,7 +18,7 @@ struct ipc_ids { unsigned short seq; struct rw_semaphore rwsem; struct idr ipcs_idr; - int max_id; + int max_idx; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif -- cgit v1.2.3 From 9abdda5ddab8a899ca8c4b859ef0a7710f40e0dd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 16 Aug 2018 12:05:59 -0400 Subject: sunrpc: Extract target name into svc_cred NFSv4.0 callback needs to know the GSS target name the client used when it established its lease. That information is available from the GSS context created by gssproxy. Make it available in each svc_cred. Note this will also give us access to the real target service principal name (which is typically "nfs", but spec does not require that). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svcauth.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 7c3656505847..04e404a07882 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -31,6 +31,7 @@ struct svc_cred { /* name of form servicetype@hostname, passed down by * rpc.svcgssd, or computed from the above: */ char *cr_principal; + char *cr_targ_princ; struct gss_api_mech *cr_gss_mech; }; @@ -39,6 +40,7 @@ static inline void init_svc_cred(struct svc_cred *cred) cred->cr_group_info = NULL; cred->cr_raw_principal = NULL; cred->cr_principal = NULL; + cred->cr_targ_princ = NULL; cred->cr_gss_mech = NULL; } @@ -48,6 +50,7 @@ static inline void free_svc_cred(struct svc_cred *cred) put_group_info(cred->cr_group_info); kfree(cred->cr_raw_principal); kfree(cred->cr_principal); + kfree(cred->cr_targ_princ); gss_mech_put(cred->cr_gss_mech); init_svc_cred(cred); } -- cgit v1.2.3 From 815f0ddb346c196018d4d8f8f55c12b83da1de3f Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 22 Aug 2018 16:37:24 -0700 Subject: include/linux/compiler*.h: make compiler-*.h mutually exclusive Commit cafa0010cd51 ("Raise the minimum required gcc version to 4.6") recently exposed a brittle part of the build for supporting non-gcc compilers. Both Clang and ICC define __GNUC__, __GNUC_MINOR__, and __GNUC_PATCHLEVEL__ for quick compatibility with code bases that haven't added compiler specific checks for __clang__ or __INTEL_COMPILER. This is brittle, as they happened to get compatibility by posing as a certain version of GCC. This broke when upgrading the minimal version of GCC required to build the kernel, to a version above what ICC and Clang claim to be. Rather than always including compiler-gcc.h then undefining or redefining macros in compiler-intel.h or compiler-clang.h, let's separate out the compiler specific macro definitions into mutually exclusive headers, do more proper compiler detection, and keep shared definitions in compiler_types.h. Fixes: cafa0010cd51 ("Raise the minimum required gcc version to 4.6") Reported-by: Masahiro Yamada Suggested-by: Eli Friedman Suggested-by: Joe Perches Signed-off-by: Nick Desaulniers Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 20 ++-- include/linux/compiler-gcc.h | 88 --------------- include/linux/compiler-intel.h | 13 +-- include/linux/compiler_types.h | 238 ++++++++++++++++++++--------------------- 4 files changed, 127 insertions(+), 232 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 7087446c24c8..b1ce500fe8b3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -6,11 +6,7 @@ /* Some compiler specific definitions are overwritten here * for Clang compiler */ - -#ifdef uninitialized_var -#undef uninitialized_var #define uninitialized_var(x) x = *(&(x)) -#endif /* same as gcc, this was present in clang-2.6 so we can assume it works * with any version that can compile the kernel @@ -25,14 +21,8 @@ #define __SANITIZE_ADDRESS__ #endif -#undef __no_sanitize_address #define __no_sanitize_address __attribute__((no_sanitize("address"))) -/* Clang doesn't have a way to turn it off per-function, yet. */ -#ifdef __noretpoline -#undef __noretpoline -#endif - /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. Fortunately, clang implements @@ -40,9 +30,17 @@ * checks. Unfortunately, we don't know which version of gcc clang * pretends to be, so the macro may or may not be defined. */ -#undef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW #if __has_builtin(__builtin_mul_overflow) && \ __has_builtin(__builtin_add_overflow) && \ __has_builtin(__builtin_sub_overflow) #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif + +/* The following are for compatibility with GCC, from compiler-gcc.h, + * and may be redefined here because they should not be shared with other + * compilers, like ICC. + */ +#define barrier() __asm__ __volatile__("" : : : "memory") +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __assume_aligned(a, ...) \ + __attribute__((__assume_aligned__(a, ## __VA_ARGS__))) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 250b9b7cfd60..763bbad1e258 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -75,48 +75,6 @@ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) #endif -/* - * Feature detection for gnu_inline (gnu89 extern inline semantics). Either - * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics, - * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not - * defined so the gnu89 semantics are the default. - */ -#ifdef __GNUC_STDC_INLINE__ -# define __gnu_inline __attribute__((gnu_inline)) -#else -# define __gnu_inline -#endif - -/* - * Force always-inline if the user requests it so via the .config, - * or if gcc is too old. - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well by using "unused" - * function attribute, which is redundant but not harmful for gcc. - * Prefer gnu_inline, so that extern inline functions do not emit an - * externally visible function. This makes extern inline behave as per gnu89 - * semantics rather than c99. This prevents multiple symbol definition errors - * of extern inline functions at link time. - * A lot of inline functions can cause havoc with function tracing. - */ -#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ - !defined(CONFIG_OPTIMIZE_INLINING) -#define inline \ - inline __attribute__((always_inline, unused)) notrace __gnu_inline -#else -#define inline inline __attribute__((unused)) notrace __gnu_inline -#endif - -#define __inline__ inline -#define __inline inline -#define __always_inline inline __attribute__((always_inline)) -#define noinline __attribute__((noinline)) - -#define __packed __attribute__((packed)) -#define __weak __attribute__((weak)) -#define __alias(symbol) __attribute__((alias(#symbol))) - #ifdef RETPOLINE #define __noretpoline __attribute__((indirect_branch("keep"))) #endif @@ -135,55 +93,9 @@ */ #define __naked __attribute__((naked)) noinline __noclone notrace -#define __noreturn __attribute__((noreturn)) - -/* - * From the GCC manual: - * - * Many functions have no effects except the return value and their - * return value depends only on the parameters and/or global - * variables. Such a function can be subject to common subexpression - * elimination and loop optimization just as an arithmetic operator - * would be. - * [...] - */ -#define __pure __attribute__((pure)) -#define __aligned(x) __attribute__((aligned(x))) -#define __aligned_largest __attribute__((aligned)) -#define __printf(a, b) __attribute__((format(printf, a, b))) -#define __scanf(a, b) __attribute__((format(scanf, a, b))) -#define __attribute_const__ __attribute__((__const__)) -#define __maybe_unused __attribute__((unused)) -#define __always_unused __attribute__((unused)) -#define __mode(x) __attribute__((mode(x))) - -#define __must_check __attribute__((warn_unused_result)) -#define __malloc __attribute__((__malloc__)) - -#define __used __attribute__((__used__)) -#define __compiler_offsetof(a, b) \ - __builtin_offsetof(a, b) - -/* Mark functions as cold. gcc will assume any path leading to a call - * to them will be unlikely. This means a lot of manual unlikely()s - * are unnecessary now for any paths leading to the usual suspects - * like BUG(), printk(), panic() etc. [but let's keep them for now for - * older compilers] - * - * Early snapshots of gcc 4.3 don't support this and we can't detect this - * in the preprocessor, but we can live with this because they're unreleased. - * Maketime probing would be overkill here. - * - * gcc also has a __attribute__((__hot__)) to move hot functions into - * a special section, but I don't see any sense in this right now in - * the kernel context - */ -#define __cold __attribute__((__cold__)) - #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) #define __optimize(level) __attribute__((__optimize__(level))) -#define __nostackprotector __optimize("no-stack-protector") #define __compiletime_object_size(obj) __builtin_object_size(obj, 0) diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h index 547cdc920a3c..4c7f9befa9f6 100644 --- a/include/linux/compiler-intel.h +++ b/include/linux/compiler-intel.h @@ -14,10 +14,6 @@ /* Intel ECC compiler doesn't support gcc specific asm stmts. * It uses intrinsics to do the equivalent things. */ -#undef barrier -#undef barrier_data -#undef RELOC_HIDE -#undef OPTIMIZER_HIDE_VAR #define barrier() __memory_barrier() #define barrier_data(ptr) barrier() @@ -38,13 +34,12 @@ #endif -#ifndef __HAVE_BUILTIN_BSWAP16__ /* icc has this, but it's called _bswap16 */ #define __HAVE_BUILTIN_BSWAP16__ #define __builtin_bswap16 _bswap16 -#endif -/* - * icc defines __GNUC__, but does not implement the builtin overflow checkers. +/* The following are for compatibility with GCC, from compiler-gcc.h, + * and may be redefined here because they should not be shared with other + * compilers, like clang. */ -#undef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +#define __visible __attribute__((externally_visible)) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index fbf337933fd8..90479a0f3986 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -54,32 +54,20 @@ extern void __chk_io_ptr(const volatile void __iomem *); #ifdef __KERNEL__ -#ifdef __GNUC__ -#include -#endif - -#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) -#define notrace __attribute__((hotpatch(0,0))) -#else -#define notrace __attribute__((no_instrument_function)) -#endif - -/* Intel compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ -#ifdef __INTEL_COMPILER -# include -#endif - -/* Clang compiler defines __GNUC__. So we will overwrite implementations - * coming from above header files here - */ +/* Compiler specific macros. */ #ifdef __clang__ #include +#elif defined(__INTEL_COMPILER) +#include +#elif defined(__GNUC__) +/* The above compilers also define __GNUC__, so order is important here. */ +#include +#else +#error "Unknown compiler" #endif /* - * Generic compiler-dependent macros required for kernel + * Generic compiler-independent macros required for kernel * build go below this comment. Actual compiler/compiler version * specific implementations come from the above header files */ @@ -106,93 +94,19 @@ struct ftrace_likely_data { unsigned long constant; }; -#endif /* __KERNEL__ */ - -#endif /* __ASSEMBLY__ */ - -#ifdef __KERNEL__ - /* Don't. Just don't. */ #define __deprecated #define __deprecated_for_modules -#ifndef __must_check -#define __must_check -#endif - -#ifndef CONFIG_ENABLE_MUST_CHECK -#undef __must_check -#define __must_check -#endif - -#ifndef __malloc -#define __malloc -#endif - -/* - * Allow us to avoid 'defined but not used' warnings on functions and data, - * as well as force them to be emitted to the assembly file. - * - * As of gcc 3.4, static functions that are not marked with attribute((used)) - * may be elided from the assembly file. As of gcc 3.4, static data not so - * marked will not be elided, but this may change in a future gcc version. - * - * NOTE: Because distributions shipped with a backported unit-at-a-time - * compiler in gcc 3.3, we must define __used to be __attribute__((used)) - * for gcc >=3.3 instead of 3.4. - * - * In prior versions of gcc, such functions and data would be emitted, but - * would be warned about except with attribute((unused)). - * - * Mark functions that are referenced only in inline assembly as __used so - * the code is emitted even though it appears to be unreferenced. - */ -#ifndef __used -# define __used /* unimplemented */ -#endif - -#ifndef __maybe_unused -# define __maybe_unused /* unimplemented */ -#endif - -#ifndef __always_unused -# define __always_unused /* unimplemented */ -#endif - -#ifndef noinline -#define noinline -#endif - -/* - * Rather then using noinline to prevent stack consumption, use - * noinline_for_stack instead. For documentation reasons. - */ -#define noinline_for_stack noinline - -#ifndef __always_inline -#define __always_inline inline -#endif - #endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ + /* - * From the GCC manual: - * - * Many functions do not examine any values except their arguments, - * and have no effects except the return value. Basically this is - * just slightly more strict class than the `pure' attribute above, - * since function is not allowed to read global memory. - * - * Note that a function that has pointer arguments and examines the - * data pointed to must _not_ be declared `const'. Likewise, a - * function that calls a non-`const' function usually must not be - * `const'. It does not make sense for a `const' function to return - * `void'. + * The below symbols may be defined for one or more, but not ALL, of the above + * compilers. We don't consider that to be an error, so set them to nothing. + * For example, some of them are for compiler specific plugins. */ -#ifndef __attribute_const__ -# define __attribute_const__ /* unimplemented */ -#endif - #ifndef __designated_init # define __designated_init #endif @@ -214,28 +128,10 @@ struct ftrace_likely_data { # define randomized_struct_fields_end #endif -/* - * Tell gcc if a function is cold. The compiler will assume any path - * directly leading to the call is unlikely. - */ - -#ifndef __cold -#define __cold -#endif - -/* Simple shorthand for a section definition */ -#ifndef __section -# define __section(S) __attribute__ ((__section__(#S))) -#endif - #ifndef __visible #define __visible #endif -#ifndef __nostackprotector -# define __nostackprotector -#endif - /* * Assume alignment of return value. */ @@ -243,17 +139,23 @@ struct ftrace_likely_data { #define __assume_aligned(a, ...) #endif - /* Are two types/vars the same type (ignoring qualifiers)? */ -#ifndef __same_type -# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -#endif +#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) /* Is this type a native word size -- useful for atomic operations */ -#ifndef __native_word -# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +#define __native_word(t) \ + (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ + sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) + +#ifndef __attribute_const__ +#define __attribute_const__ __attribute__((__const__)) #endif +#ifndef __noclone +#define __noclone +#endif + +/* Helpers for emitting diagnostics in pragmas. */ #ifndef __diag #define __diag(string) #endif @@ -272,4 +174,92 @@ struct ftrace_likely_data { #define __diag_error(compiler, version, option, comment) \ __diag_ ## compiler(version, error, option) +/* + * From the GCC manual: + * + * Many functions have no effects except the return value and their + * return value depends only on the parameters and/or global + * variables. Such a function can be subject to common subexpression + * elimination and loop optimization just as an arithmetic operator + * would be. + * [...] + */ +#define __pure __attribute__((pure)) +#define __aligned(x) __attribute__((aligned(x))) +#define __aligned_largest __attribute__((aligned)) +#define __printf(a, b) __attribute__((format(printf, a, b))) +#define __scanf(a, b) __attribute__((format(scanf, a, b))) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) +#define __mode(x) __attribute__((mode(x))) +#define __malloc __attribute__((__malloc__)) +#define __used __attribute__((__used__)) +#define __noreturn __attribute__((noreturn)) +#define __packed __attribute__((packed)) +#define __weak __attribute__((weak)) +#define __alias(symbol) __attribute__((alias(#symbol))) +#define __cold __attribute__((cold)) +#define __section(S) __attribute__((__section__(#S))) + + +#ifdef CONFIG_ENABLE_MUST_CHECK +#define __must_check __attribute__((warn_unused_result)) +#else +#define __must_check +#endif + +#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__) +#define notrace __attribute__((hotpatch(0, 0))) +#else +#define notrace __attribute__((no_instrument_function)) +#endif + +#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) + +/* + * Feature detection for gnu_inline (gnu89 extern inline semantics). Either + * __GNUC_STDC_INLINE__ is defined (not using gnu89 extern inline semantics, + * and we opt in to the gnu89 semantics), or __GNUC_STDC_INLINE__ is not + * defined so the gnu89 semantics are the default. + */ +#ifdef __GNUC_STDC_INLINE__ +# define __gnu_inline __attribute__((gnu_inline)) +#else +# define __gnu_inline +#endif + +/* + * Force always-inline if the user requests it so via the .config. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. + * Prefer gnu_inline, so that extern inline functions do not emit an + * externally visible function. This makes extern inline behave as per gnu89 + * semantics rather than c99. This prevents multiple symbol definition errors + * of extern inline functions at link time. + * A lot of inline functions can cause havoc with function tracing. + */ +#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ + !defined(CONFIG_OPTIMIZE_INLINING) +#define inline \ + inline __attribute__((always_inline, unused)) notrace __gnu_inline +#else +#define inline inline __attribute__((unused)) notrace __gnu_inline +#endif + +#define __inline__ inline +#define __inline inline +#define noinline __attribute__((noinline)) + +#ifndef __always_inline +#define __always_inline inline __attribute__((always_inline)) +#endif + +/* + * Rather then using noinline to prevent stack consumption, use + * noinline_for_stack instead. For documentation reasons. + */ +#define noinline_for_stack noinline + #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3 From 30aba6656f61ed44cba445a3c0d38b296fa9e8f5 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Thu, 23 Aug 2018 17:00:35 -0700 Subject: namei: allow restricted O_CREAT of FIFOs and regular files Disallows open of FIFOs or regular files not owned by the user in world writable sticky directories, unless the owner is the same as that of the directory or the file is opened without the O_CREAT flag. The purpose is to make data spoofing attacks harder. This protection can be turned on and off separately for FIFOs and regular files via sysctl, just like the symlinks/hardlinks protection. This patch is based on Openwall's "HARDEN_FIFO" feature by Solar Designer. This is a brief list of old vulnerabilities that could have been prevented by this feature, some of them even allow for privilege escalation: CVE-2000-1134 CVE-2007-3852 CVE-2008-0525 CVE-2009-0416 CVE-2011-4834 CVE-2015-1838 CVE-2015-7442 CVE-2016-7489 This list is not meant to be complete. It's difficult to track down all vulnerabilities of this kind because they were often reported without any mention of this particular attack vector. In fact, before hardlinks/symlinks restrictions, fifos/regular files weren't the favorite vehicle to exploit them. [s.mesoraca16@gmail.com: fix bug reported by Dan Carpenter] Link: https://lkml.kernel.org/r/20180426081456.GA7060@mwanda Link: http://lkml.kernel.org/r/1524829819-11275-1-git-send-email-s.mesoraca16@gmail.com [keescook@chromium.org: drop pr_warn_ratelimited() in favor of audit changes in the future] [keescook@chromium.org: adjust commit subjet] Link: http://lkml.kernel.org/r/20180416175918.GA13494@beast Signed-off-by: Salvatore Mesoraca Signed-off-by: Kees Cook Suggested-by: Solar Designer Suggested-by: Kees Cook Cc: Al Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5710541183b..33322702c910 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,8 @@ extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; +extern int sysctl_protected_fifos; +extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; -- cgit v1.2.3 From d4ae9916ea2947341180d2b538f48875ff393a86 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 23 Aug 2018 17:00:42 -0700 Subject: mm: soft-offline: close the race against page allocation A process can be killed with SIGBUS(BUS_MCEERR_AR) when it tries to allocate a page that was just freed on the way of soft-offline. This is undesirable because soft-offline (which is about corrected error) is less aggressive than hard-offline (which is about uncorrected error), and we can make soft-offline fail and keep using the page for good reason like "system is busy." Two main changes of this patch are: - setting migrate type of the target page to MIGRATE_ISOLATE. As done in free_unref_page_commit(), this makes kernel bypass pcplist when freeing the page. So we can assume that the page is in freelist just after put_page() returns, - setting PG_hwpoison on free page under zone->lock which protects freelists, so this allows us to avoid setting PG_hwpoison on a page that is decided to be allocated soon. [akpm@linux-foundation.org: tweak set_hwpoison_free_buddy_page() comment] Link: http://lkml.kernel.org/r/1531452366-11661-3-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Reported-by: Xishi Qiu Tested-by: Mike Kravetz Cc: Michal Hocko Cc: Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 +++++ include/linux/swapops.h | 10 ---------- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 901943e4754b..74bee8cecf4c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -369,8 +369,13 @@ PAGEFLAG_FALSE(Uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +extern bool set_hwpoison_free_buddy_page(struct page *page); #else PAGEFLAG_FALSE(HWPoison) +static inline bool set_hwpoison_free_buddy_page(struct page *page) +{ + return 0; +} #define __PG_HWPOISON 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 1d3877c39a00..568a3553d918 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -340,11 +340,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline bool test_set_page_hwpoison(struct page *page) -{ - return TestSetPageHWPoison(page); -} - static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); @@ -367,11 +362,6 @@ static inline int is_hwpoison_entry(swp_entry_t swp) return 0; } -static inline bool test_set_page_hwpoison(struct page *page) -{ - return false; -} - static inline void num_poisoned_pages_inc(void) { } -- cgit v1.2.3 From 263fade51f7bdd5ad7ebbfe82113a44c5ea4c36c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 23 Aug 2018 17:01:15 -0700 Subject: docs/mm: make GFP flags descriptions usable as kernel-doc This patch adds DOC: headings for GFP flag descriptions and adjusts the formatting to fit sphinx expectations of paragraphs. Link: http://lkml.kernel.org/r/1532626360-16650-7-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 291 +++++++++++++++++++++++++++------------------------- 1 file changed, 154 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a6afcec53795..24bcc5eec6b4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -59,29 +59,32 @@ struct vm_area_struct; #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) -/* +/** + * DOC: Page mobility and placement hints + * * Page mobility and placement hints + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * These flags provide hints about how mobile the page is. Pages with similar * mobility are placed within the same pageblocks to minimise problems due * to external fragmentation. * - * __GFP_MOVABLE (also a zone modifier) indicates that the page can be - * moved by page migration during memory compaction or can be reclaimed. + * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be + * moved by page migration during memory compaction or can be reclaimed. * - * __GFP_RECLAIMABLE is used for slab allocations that specify - * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. + * %__GFP_RECLAIMABLE is used for slab allocations that specify + * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. * - * __GFP_WRITE indicates the caller intends to dirty the page. Where possible, - * these pages will be spread between local zones to avoid all the dirty - * pages being in one zone (fair zone allocation policy). + * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible, + * these pages will be spread between local zones to avoid all the dirty + * pages being in one zone (fair zone allocation policy). * - * __GFP_HARDWALL enforces the cpuset memory allocation policy. + * %__GFP_HARDWALL enforces the cpuset memory allocation policy. * - * __GFP_THISNODE forces the allocation to be satisified from the requested - * node with no fallbacks or placement policy enforcements. + * %__GFP_THISNODE forces the allocation to be satisified from the requested + * node with no fallbacks or placement policy enforcements. * - * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg. + * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) @@ -89,54 +92,60 @@ struct vm_area_struct; #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) -/* +/** + * DOC: Watermark modifiers + * * Watermark modifiers -- controls access to emergency reserves + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * - * __GFP_HIGH indicates that the caller is high-priority and that granting - * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. + * %__GFP_HIGH indicates that the caller is high-priority and that granting + * the request is necessary before the system can make forward progress. + * For example, creating an IO context to clean pages. * - * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with __GFP_HIGH + * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is + * high priority. Users are typically interrupt handlers. This may be + * used in conjunction with %__GFP_HIGH * - * __GFP_MEMALLOC allows access to all memory. This should only be used when - * the caller guarantees the allocation will allow more memory to be freed - * very shortly e.g. process exiting or swapping. Users either should - * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * %__GFP_MEMALLOC allows access to all memory. This should only be used when + * the caller guarantees the allocation will allow more memory to be freed + * very shortly e.g. process exiting or swapping. Users either should + * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). * - * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. - * This takes precedence over the __GFP_MEMALLOC flag if both are set. + * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. + * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ #define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) -/* +/** + * DOC: Reclaim modifiers + * * Reclaim modifiers + * ~~~~~~~~~~~~~~~~~ * - * __GFP_IO can start physical IO. + * %__GFP_IO can start physical IO. * - * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the - * allocator recursing into the filesystem which might already be holding - * locks. + * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the + * allocator recursing into the filesystem which might already be holding + * locks. * - * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. - * This flag can be cleared to avoid unnecessary delays when a fallback - * option is available. + * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. + * This flag can be cleared to avoid unnecessary delays when a fallback + * option is available. * - * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when - * the low watermark is reached and have it reclaim pages until the high - * watermark is reached. A caller may wish to clear this flag when fallback - * options are available and the reclaim is likely to disrupt the system. The - * canonical example is THP allocation where a fallback is cheap but - * reclaim/compaction may cause indirect stalls. + * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when + * the low watermark is reached and have it reclaim pages until the high + * watermark is reached. A caller may wish to clear this flag when fallback + * options are available and the reclaim is likely to disrupt the system. The + * canonical example is THP allocation where a fallback is cheap but + * reclaim/compaction may cause indirect stalls. * - * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. + * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. We have a concept - * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER). + * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be @@ -144,40 +153,40 @@ struct vm_area_struct; * The following three modifiers might be used to override some of these * implicit rules * - * __GFP_NORETRY: The VM implementation will try only very lightweight - * memory direct reclaim to get some memory under memory pressure (thus - * it can sleep). It will avoid disruptive actions like OOM killer. The - * caller must handle the failure which is quite likely to happen under - * heavy memory pressure. The flag is suitable when failure can easily be - * handled at small cost, such as reduced throughput - * - * __GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim - * procedures that have previously failed if there is some indication - * that progress has been made else where. It can wait for other - * tasks to attempt high level approaches to freeing memory such as - * compaction (which removes fragmentation) and page-out. - * There is still a definite limit to the number of retries, but it is - * a larger limit than with __GFP_NORETRY. - * Allocations with this flag may fail, but only when there is - * genuinely little unused memory. While these allocations do not - * directly trigger the OOM killer, their failure indicates that - * the system is likely to need to use the OOM killer soon. The - * caller must handle failure, but can reasonably do so by failing - * a higher-level request, or completing it only in a much less - * efficient manner. - * If the allocation does fail, and the caller is in a position to - * free some non-essential memory, doing so could benefit the system - * as a whole. - * - * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. The allocation could block - * indefinitely but will never return with failure. Testing for - * failure is pointless. - * New users should be evaluated carefully (and the flag should be - * used only when there is no reasonable failure policy) but it is - * definitely preferable to use the flag rather than opencode endless - * loop around allocator. - * Using this flag for costly allocations is _highly_ discouraged. + * %__GFP_NORETRY: The VM implementation will try only very lightweight + * memory direct reclaim to get some memory under memory pressure (thus + * it can sleep). It will avoid disruptive actions like OOM killer. The + * caller must handle the failure which is quite likely to happen under + * heavy memory pressure. The flag is suitable when failure can easily be + * handled at small cost, such as reduced throughput + * + * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim + * procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other + * tasks to attempt high level approaches to freeing memory such as + * compaction (which removes fragmentation) and page-out. + * There is still a definite limit to the number of retries, but it is + * a larger limit than with %__GFP_NORETRY. + * Allocations with this flag may fail, but only when there is + * genuinely little unused memory. While these allocations do not + * directly trigger the OOM killer, their failure indicates that + * the system is likely to need to use the OOM killer soon. The + * caller must handle failure, but can reasonably do so by failing + * a higher-level request, or completing it only in a much less + * efficient manner. + * If the allocation does fail, and the caller is in a position to + * free some non-essential memory, doing so could benefit the system + * as a whole. + * + * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller + * cannot handle allocation failures. The allocation could block + * indefinitely but will never return with failure. Testing for + * failure is pointless. + * New users should be evaluated carefully (and the flag should be + * used only when there is no reasonable failure policy) but it is + * definitely preferable to use the flag rather than opencode endless + * loop around allocator. + * Using this flag for costly allocations is _highly_ discouraged. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) @@ -188,14 +197,17 @@ struct vm_area_struct; #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) -/* +/** + * DOC: Action modifiers + * * Action modifiers + * ~~~~~~~~~~~~~~~~ * - * __GFP_NOWARN suppresses allocation failure reports. + * %__GFP_NOWARN suppresses allocation failure reports. * - * __GFP_COMP address compound page metadata. + * %__GFP_COMP address compound page metadata. * - * __GFP_ZERO returns a zeroed page on success. + * %__GFP_ZERO returns a zeroed page on success. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) @@ -208,66 +220,71 @@ struct vm_area_struct; #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) -/* +/** + * DOC: Useful GFP flag combinations + * + * Useful GFP flag combinations + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * * Useful GFP flag combinations that are commonly used. It is recommended * that subsystems start with one of these combinations and then set/clear - * __GFP_FOO flags as necessary. - * - * GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower - * watermark is applied to allow access to "atomic reserves" - * - * GFP_KERNEL is typical for kernel-internal allocations. The caller requires - * ZONE_NORMAL or a lower zone for direct access but can direct reclaim. - * - * GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is - * accounted to kmemcg. - * - * GFP_NOWAIT is for kernel allocations that should not stall for direct - * reclaim, start physical IO or use any filesystem callback. - * - * GFP_NOIO will use direct reclaim to discard clean pages or slab pages - * that do not require the starting of any physical IO. - * Please try to avoid using this flag directly and instead use - * memalloc_noio_{save,restore} to mark the whole scope which cannot - * perform any IO with a short explanation why. All allocation requests - * will inherit GFP_NOIO implicitly. - * - * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. - * Please try to avoid using this flag directly and instead use - * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't - * recurse into the FS layer with a short explanation why. All allocation - * requests will inherit GFP_NOFS implicitly. - * - * GFP_USER is for userspace allocations that also need to be directly - * accessibly by the kernel or hardware. It is typically used by hardware - * for buffers that are mapped to userspace (e.g. graphics) that hardware - * still must DMA to. cpuset limits are enforced for these allocations. - * - * GFP_DMA exists for historical reasons and should be avoided where possible. - * The flags indicates that the caller requires that the lowest zone be - * used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but - * it would require careful auditing as some users really require it and - * others use the flag to avoid lowmem reserves in ZONE_DMA and treat the - * lowest zone as a type of emergency reserve. - * - * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit - * address. - * - * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, - * do not need to be directly accessible by the kernel but that cannot - * move once in use. An example may be a hardware allocation that maps - * data directly into userspace but has no addressing limitations. - * - * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not - * need direct access to but can use kmap() when access is required. They - * are expected to be movable via page reclaim or page migration. Typically, - * pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE. - * - * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are - * compound allocations that will generally fail quickly if memory is not - * available and will not wake kswapd/kcompactd on failure. The _LIGHT - * version does not attempt reclaim/compaction at all and is by default used - * in page fault path, while the non-light is used by khugepaged. + * %__GFP_FOO flags as necessary. + * + * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower + * watermark is applied to allow access to "atomic reserves" + * + * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires + * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. + * + * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is + * accounted to kmemcg. + * + * %GFP_NOWAIT is for kernel allocations that should not stall for direct + * reclaim, start physical IO or use any filesystem callback. + * + * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages + * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. + * + * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. + * + * %GFP_USER is for userspace allocations that also need to be directly + * accessibly by the kernel or hardware. It is typically used by hardware + * for buffers that are mapped to userspace (e.g. graphics) that hardware + * still must DMA to. cpuset limits are enforced for these allocations. + * + * %GFP_DMA exists for historical reasons and should be avoided where possible. + * The flags indicates that the caller requires that the lowest zone be + * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but + * it would require careful auditing as some users really require it and + * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the + * lowest zone as a type of emergency reserve. + * + * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit + * address. + * + * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, + * do not need to be directly accessible by the kernel but that cannot + * move once in use. An example may be a hardware allocation that maps + * data directly into userspace but has no addressing limitations. + * + * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not + * need direct access to but can use kmap() when access is required. They + * are expected to be movable via page reclaim or page migration. Typically, + * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE. + * + * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They + * are compound allocations that will generally fail quickly if memory is not + * available and will not wake kswapd/kcompactd on failure. The _LIGHT + * version does not attempt reclaim/compaction at all and is by default used + * in page fault path, while the non-light is used by khugepaged. */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) -- cgit v1.2.3 From 2b7403035459c75e193c6b04a293e518a4212de0 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 23 Aug 2018 17:01:36 -0700 Subject: mm: Change return type int to vm_fault_t for fault handlers Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t") The aim is to change the return type of finish_fault() and handle_mm_fault() to vm_fault_t type. As part of that clean up return type of all other recursively called functions have been changed to vm_fault_t type. The places from where handle_mm_fault() is getting invoked will be change to vm_fault_t type but in a separate patch. vmf_error() is the newly introduce inline function in 4.17-rc6. [akpm@linux-foundation.org: don't shadow outer local `ret' in __do_huge_pmd_anonymous_page()] Link: http://lkml.kernel.org/r/20180604171727.GA20279@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 9 +++++---- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 14 +++++++------- include/linux/oom.h | 2 +- include/linux/swapops.h | 5 +++-- include/linux/userfaultfd_k.h | 5 +++-- 6 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8a126259bc4..27e3e32135a8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,7 +6,7 @@ #include /* only for vma_is_dax() */ -extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); +extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); @@ -23,7 +23,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) } #endif -extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); +extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -216,7 +216,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags); -extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); +extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -321,7 +321,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } -static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, + pmd_t orig_pmd) { return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c39d9170a8a0..6b68e345f0ca 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,7 +105,7 @@ void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, diff --git a/include/linux/mm.h b/include/linux/mm.h index a9e733b5fb76..8fcc36660de6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -728,10 +728,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); -int finish_fault(struct vm_fault *vmf); -int finish_mkwrite_fault(struct vm_fault *vmf); +vm_fault_t finish_fault(struct vm_fault *vmf); +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* @@ -1403,8 +1403,8 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU -extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags); +extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); @@ -1413,7 +1413,7 @@ void unmap_mapping_pages(struct address_space *mapping, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else -static inline int handle_mm_fault(struct vm_area_struct *vma, +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { /* should never happen if there's no MMU */ @@ -2563,7 +2563,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ -static inline int vm_fault_to_errno(int vm_fault, int foll_flags) +static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; diff --git a/include/linux/oom.h b/include/linux/oom.h index 92f70e4c6252..69864a547663 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -88,7 +88,7 @@ static inline bool mm_is_oom_victim(struct mm_struct *mm) * * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise. */ -static inline int check_stable_address_space(struct mm_struct *mm) +static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) return VM_FAULT_SIGBUS; diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 568a3553d918..22af9d8a84ae 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -4,6 +4,7 @@ #include #include +#include /* * swapcache pages are stored in the swapper_space radix tree. We want to @@ -134,7 +135,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) return pfn_to_page(swp_offset(entry)); } -int device_private_entry_fault(struct vm_area_struct *vma, +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, @@ -169,7 +170,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) return NULL; } -static inline int device_private_entry_fault(struct vm_area_struct *vma, +static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e091f0a11b11..37c9eba75c98 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,7 +28,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); +extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, @@ -77,7 +77,8 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) +static inline vm_fault_t handle_userfault(struct vm_fault *vmf, + unsigned long reason) { return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 0c36dd37d5b80b43f4e3dc5a1dbfe6dbd86e8f2a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 21 Aug 2018 17:02:40 +0200 Subject: i2c: remove deprecated attach_adapter callback There aren't any users left. Remove this callback from the 2.4 times. Phew, finally, that took years to reach... Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 36f357ecdf67..b79387fd57da 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -231,7 +231,6 @@ enum i2c_alert_protocol { /** * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) - * @attach_adapter: Callback for bus addition (deprecated) * @probe: Callback for device binding - soon to be deprecated * @probe_new: New callback for device binding * @remove: Callback for device unbinding @@ -268,11 +267,6 @@ enum i2c_alert_protocol { struct i2c_driver { unsigned int class; - /* Notifies the driver that a new bus has appeared. You should avoid - * using this, it will be removed in a near future. - */ - int (*attach_adapter)(struct i2c_adapter *) __deprecated; - /* Standard driver model interfaces */ int (*probe)(struct i2c_client *, const struct i2c_device_id *); int (*remove)(struct i2c_client *); -- cgit v1.2.3 From 5d3a01a228c703dad971cad11f14c0512c4c2713 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Wed, 22 Aug 2018 18:16:07 +0200 Subject: i2c: ocores: update my email address The old @sunsite.dk address is no longer active, so update the references. Signed-off-by: Peter Korsgaard Signed-off-by: Wolfram Sang --- include/linux/platform_data/i2c-ocores.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/i2c-ocores.h b/include/linux/platform_data/i2c-ocores.h index 01edd96fe1f7..113d6b12f650 100644 --- a/include/linux/platform_data/i2c-ocores.h +++ b/include/linux/platform_data/i2c-ocores.h @@ -1,7 +1,7 @@ /* * i2c-ocores.h - definitions for the i2c-ocores interface * - * Peter Korsgaard + * Peter Korsgaard * * This file is licensed under the terms of the GNU General Public License * version 2. This program is licensed "as is" without any warranty of any -- cgit v1.2.3 From 3ad867001c91657c46dcf6656d52eb6080286fd5 Mon Sep 17 00:00:00 2001 From: Lothar Felten Date: Tue, 14 Aug 2018 09:09:37 +0200 Subject: hwmon: (ina2xx) fix sysfs shunt resistor read access fix the sysfs shunt resistor read access: return the shunt resistor value, not the calibration register contents. update email address Signed-off-by: Lothar Felten Signed-off-by: Guenter Roeck --- include/linux/platform_data/ina2xx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/ina2xx.h b/include/linux/platform_data/ina2xx.h index 9abc0ca7259b..9f0aa1b48c78 100644 --- a/include/linux/platform_data/ina2xx.h +++ b/include/linux/platform_data/ina2xx.h @@ -1,7 +1,7 @@ /* * Driver for Texas Instruments INA219, INA226 power monitor chips * - * Copyright (C) 2012 Lothar Felten + * Copyright (C) 2012 Lothar Felten * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as -- cgit v1.2.3 From 1d8f574708a3fb6f18c85486d0c5217df893c0cf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 Aug 2018 15:08:29 +0100 Subject: arm/arm64: smccc-1.1: Make return values unsigned long An unfortunate consequence of having a strong typing for the input values to the SMC call is that it also affects the type of the return values, limiting r0 to 32 bits and r{1,2,3} to whatever was passed as an input. Let's turn everything into "unsigned long", which satisfies the requirements of both architectures, and allows for the full range of return values. Reported-by: Julien Grall Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index ca1d2cc2cdfa..5a91ff33720b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -199,31 +199,31 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define __declare_arg_0(a0, res) \ struct arm_smccc_res *___res = res; \ - register u32 r0 asm("r0") = a0; \ + register unsigned long r0 asm("r0") = (u32)a0; \ register unsigned long r1 asm("r1"); \ register unsigned long r2 asm("r2"); \ register unsigned long r3 asm("r3") #define __declare_arg_1(a0, a1, res) \ struct arm_smccc_res *___res = res; \ - register u32 r0 asm("r0") = a0; \ - register typeof(a1) r1 asm("r1") = a1; \ + register unsigned long r0 asm("r0") = (u32)a0; \ + register unsigned long r1 asm("r1") = a1; \ register unsigned long r2 asm("r2"); \ register unsigned long r3 asm("r3") #define __declare_arg_2(a0, a1, a2, res) \ struct arm_smccc_res *___res = res; \ - register u32 r0 asm("r0") = a0; \ - register typeof(a1) r1 asm("r1") = a1; \ - register typeof(a2) r2 asm("r2") = a2; \ + register unsigned long r0 asm("r0") = (u32)a0; \ + register unsigned long r1 asm("r1") = a1; \ + register unsigned long r2 asm("r2") = a2; \ register unsigned long r3 asm("r3") #define __declare_arg_3(a0, a1, a2, a3, res) \ struct arm_smccc_res *___res = res; \ - register u32 r0 asm("r0") = a0; \ - register typeof(a1) r1 asm("r1") = a1; \ - register typeof(a2) r2 asm("r2") = a2; \ - register typeof(a3) r3 asm("r3") = a3 + register unsigned long r0 asm("r0") = (u32)a0; \ + register unsigned long r1 asm("r1") = a1; \ + register unsigned long r2 asm("r2") = a2; \ + register unsigned long r3 asm("r3") = a3 #define __declare_arg_4(a0, a1, a2, a3, a4, res) \ __declare_arg_3(a0, a1, a2, a3, res); \ -- cgit v1.2.3 From 36156f9241cb0f9e37d998052873ca7501ad4b36 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 27 Aug 2018 10:21:45 +0200 Subject: of: add helper to lookup compatible child node Add of_get_compatible_child() helper that can be used to lookup compatible child nodes. Several drivers currently use of_find_compatible_node() to lookup child nodes while failing to notice that the of_find_ functions search the entire tree depth-first (from a given start node) and therefore can match unrelated nodes. The fact that these functions also drop a reference to the node they start searching from (e.g. the parent node) is typically also overlooked, something which can lead to use-after-free bugs. Signed-off-by: Johan Hovold Signed-off-by: Rob Herring --- include/linux/of.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 4d25e4f952d9..b99a1a8c2952 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -290,6 +290,8 @@ extern struct device_node *of_get_next_child(const struct device_node *node, extern struct device_node *of_get_next_available_child( const struct device_node *node, struct device_node *prev); +extern struct device_node *of_get_compatible_child(const struct device_node *parent, + const char *compatible); extern struct device_node *of_get_child_by_name(const struct device_node *node, const char *name); @@ -632,6 +634,12 @@ static inline bool of_have_populated_dt(void) return false; } +static inline struct device_node *of_get_compatible_child(const struct device_node *parent, + const char *compatible) +{ + return NULL; +} + static inline struct device_node *of_get_child_by_name( const struct device_node *node, const char *name) -- cgit v1.2.3 From 755a8bf5579d22eb5636685c516d8dede799e27b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 Aug 2018 15:08:30 +0100 Subject: arm/arm64: smccc-1.1: Handle function result as parameters If someone has the silly idea to write something along those lines: extern u64 foo(void); void bar(struct arm_smccc_res *res) { arm_smccc_1_1_smc(0xbad, foo(), res); } they are in for a surprise, as this gets compiled as: 0000000000000588 : 588: a9be7bfd stp x29, x30, [sp, #-32]! 58c: 910003fd mov x29, sp 590: f9000bf3 str x19, [sp, #16] 594: aa0003f3 mov x19, x0 598: aa1e03e0 mov x0, x30 59c: 94000000 bl 0 <_mcount> 5a0: 94000000 bl 0 5a4: aa0003e1 mov x1, x0 5a8: d4000003 smc #0x0 5ac: b4000073 cbz x19, 5b8 5b0: a9000660 stp x0, x1, [x19] 5b4: a9010e62 stp x2, x3, [x19, #16] 5b8: f9400bf3 ldr x19, [sp, #16] 5bc: a8c27bfd ldp x29, x30, [sp], #32 5c0: d65f03c0 ret 5c4: d503201f nop The call to foo "overwrites" the x0 register for the return value, and we end up calling the wrong secure service. A solution is to evaluate all the parameters before assigning anything to specific registers, leading to the expected result: 0000000000000588 : 588: a9be7bfd stp x29, x30, [sp, #-32]! 58c: 910003fd mov x29, sp 590: f9000bf3 str x19, [sp, #16] 594: aa0003f3 mov x19, x0 598: aa1e03e0 mov x0, x30 59c: 94000000 bl 0 <_mcount> 5a0: 94000000 bl 0 5a4: aa0003e1 mov x1, x0 5a8: d28175a0 mov x0, #0xbad 5ac: d4000003 smc #0x0 5b0: b4000073 cbz x19, 5bc 5b4: a9000660 stp x0, x1, [x19] 5b8: a9010e62 stp x2, x3, [x19, #16] 5bc: f9400bf3 ldr x19, [sp, #16] 5c0: a8c27bfd ldp x29, x30, [sp], #32 5c4: d65f03c0 ret Reported-by: Julien Grall Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 5a91ff33720b..18863d56273c 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -205,41 +205,51 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, register unsigned long r3 asm("r3") #define __declare_arg_1(a0, a1, res) \ + typeof(a1) __a1 = a1; \ struct arm_smccc_res *___res = res; \ register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = a1; \ + register unsigned long r1 asm("r1") = __a1; \ register unsigned long r2 asm("r2"); \ register unsigned long r3 asm("r3") #define __declare_arg_2(a0, a1, a2, res) \ + typeof(a1) __a1 = a1; \ + typeof(a2) __a2 = a2; \ struct arm_smccc_res *___res = res; \ register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = a1; \ - register unsigned long r2 asm("r2") = a2; \ + register unsigned long r1 asm("r1") = __a1; \ + register unsigned long r2 asm("r2") = __a2; \ register unsigned long r3 asm("r3") #define __declare_arg_3(a0, a1, a2, a3, res) \ + typeof(a1) __a1 = a1; \ + typeof(a2) __a2 = a2; \ + typeof(a3) __a3 = a3; \ struct arm_smccc_res *___res = res; \ register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = a1; \ - register unsigned long r2 asm("r2") = a2; \ - register unsigned long r3 asm("r3") = a3 + register unsigned long r1 asm("r1") = __a1; \ + register unsigned long r2 asm("r2") = __a2; \ + register unsigned long r3 asm("r3") = __a3 #define __declare_arg_4(a0, a1, a2, a3, a4, res) \ + typeof(a4) __a4 = a4; \ __declare_arg_3(a0, a1, a2, a3, res); \ - register typeof(a4) r4 asm("r4") = a4 + register unsigned long r4 asm("r4") = __a4 #define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \ + typeof(a5) __a5 = a5; \ __declare_arg_4(a0, a1, a2, a3, a4, res); \ - register typeof(a5) r5 asm("r5") = a5 + register unsigned long r5 asm("r5") = __a5 #define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \ + typeof(a6) __a6 = a6; \ __declare_arg_5(a0, a1, a2, a3, a4, a5, res); \ - register typeof(a6) r6 asm("r6") = a6 + register unsigned long r6 asm("r6") = __a6 #define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \ + typeof(a7) __a7 = a7; \ __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \ - register typeof(a7) r7 asm("r7") = a7 + register unsigned long r7 asm("r7") = __a7 #define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__) #define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__) -- cgit v1.2.3 From f42b0e18f2e5cf34f73ef1b6327b49040b307a33 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 27 Aug 2018 07:50:47 -0500 Subject: of: add node name compare helper functions In preparation to remove device_node.name pointer, add helper functions for node name comparisons which are a common pattern throughout the kernel. Cc: Frank Rowand Signed-off-by: Rob Herring --- include/linux/of.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index b99a1a8c2952..688c52dd7b3e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -256,6 +256,9 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size) #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags) #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags) +extern bool of_node_name_eq(const struct device_node *np, const char *name); +extern bool of_node_name_prefix(const struct device_node *np, const char *prefix); + static inline const char *of_node_full_name(const struct device_node *np) { return np ? np->full_name : ""; @@ -563,6 +566,16 @@ static inline struct device_node *to_of_node(const struct fwnode_handle *fwnode) return NULL; } +static inline bool of_node_name_eq(const struct device_node *np, const char *name) +{ + return false; +} + +static inline bool of_node_name_prefix(const struct device_node *np, const char *prefix) +{ + return false; +} + static inline const char* of_node_full_name(const struct device_node *np) { return ""; -- cgit v1.2.3 From 82fe39a6bc7b866fc3ffd838e3c5a4cadb328b04 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 24 Aug 2018 16:52:44 +0200 Subject: i2c: refactor function to release a DMA safe buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit a) rename to 'put' instead of 'release' to match 'get' when obtaining the buffer b) change the argument order to have the buffer as first argument c) add a new argument telling the function if the message was transferred. This allows the function to be used also in cases where setting up DMA failed, so the buffer needs to be freed without syncing to the message buffer. Also convert the only user. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b79387fd57da..65b4eaed1d96 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -855,7 +855,7 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) } u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); -void i2c_release_dma_safe_msg_buf(struct i2c_msg *msg, u8 *buf); +void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); /** -- cgit v1.2.3 From 0413bedabc886c3a56804d1c80a58e99077b1d91 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 28 Aug 2018 15:10:48 -0500 Subject: of: Add device_type access helper functions In preparation to remove direct access to device_node.type, add of_node_is_type() and of_node_get_device_type() helpers to check and retrieve the device type. Cc: Frank Rowand Signed-off-by: Rob Herring --- include/linux/of.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 688c52dd7b3e..99b0ebf49632 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -988,6 +988,18 @@ static inline struct device_node *of_find_matching_node( return of_find_matching_node_and_match(from, matches, NULL); } +static inline const char *of_node_get_device_type(const struct device_node *np) +{ + return of_get_property(np, "type", NULL); +} + +static inline bool of_node_is_type(const struct device_node *np, const char *type) +{ + const char *match = of_node_get_device_type(np); + + return np && match && type && !strcmp(match, type); +} + /** * of_property_count_u8_elems - Count the number of u8 elements in a property * -- cgit v1.2.3 From 6b06546206868f723f2061d703a3c3c378dcbf4c Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Fri, 31 Aug 2018 16:22:42 -0400 Subject: Revert "blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()" This reverts commit 4c6994806f708559c2812b73501406e21ae5dcd0. Destroying blkgs is tricky because of the nature of the relationship. A blkg should go away when either a blkcg or a request_queue goes away. However, blkg's pin the blkcg to ensure they remain valid. To break this cycle, when a blkcg is offlined, blkgs put back their css ref. This eventually lets css_free() get called which frees the blkcg. The above commit (4c6994806f70) breaks this order of events by trying to destroy blkgs in css_free(). As the blkgs still hold references to the blkcg, css_free() is never called. The race between blkcg_bio_issue_check() and cgroup_rmdir() will be addressed in the following patch by delaying destruction of a blkg until all writeback associated with the blkcg has been finished. Fixes: 4c6994806f70 ("blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()") Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Cc: Jiufei Xue Cc: Joseph Qi Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 34aec30e06c7..1615cdd4c797 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -89,7 +89,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - bool offline; }; /* -- cgit v1.2.3 From 59b57717fff8b562825d9d25e0180ad7e8048ca9 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Fri, 31 Aug 2018 16:22:43 -0400 Subject: blkcg: delay blkg destruction until after writeback has finished Currently, blkcg destruction relies on a sequence of events: 1. Destruction starts. blkcg_css_offline() is called and blkgs release their reference to the blkcg. This immediately destroys the cgwbs (writeback). 2. With blkgs giving up their reference, the blkcg ref count should become zero and eventually call blkcg_css_free() which finally frees the blkcg. Jiufei Xue reported that there is a race between blkcg_bio_issue_check() and cgroup_rmdir(). To remedy this, blkg destruction becomes contingent on the completion of all writeback associated with the blkcg. A count of the number of cgwbs is maintained and once that goes to zero, blkg destruction can follow. This should prevent premature blkg destruction related to writeback. The new process for blkcg cleanup is as follows: 1. Destruction starts. blkcg_css_offline() is called which offlines writeback. Blkg destruction is delayed on the cgwb_refcnt count to avoid punting potentially large amounts of outstanding writeback to root while maintaining any ongoing policies. Here, the base cgwb_refcnt is put back. 2. When the cgwb_refcnt becomes zero, blkcg_destroy_blkgs() is called and handles destruction of blkgs. This is where the css reference held by each blkg is released. 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. This finally frees the blkg. It seems in the past blk-throttle didn't do the most understandable things with taking data from a blkg while associating with current. So, the simplification and unification of what blk-throttle is doing caused this. Fixes: 08e18eab0c579 ("block: add bi_blkg to the bio for cgroups") Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Cc: Jiufei Xue Cc: Joseph Qi Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1615cdd4c797..6d766a19f2bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -56,6 +56,7 @@ struct blkcg { struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; + refcount_t cgwb_refcnt; #endif }; @@ -386,6 +387,49 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) return cpd ? cpd->blkcg : NULL; } +extern void blkcg_destroy_blkgs(struct blkcg *blkcg); + +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * blkcg_cgwb_get - get a reference for blkcg->cgwb_list + * @blkcg: blkcg of interest + * + * This is used to track the number of active wb's related to a blkcg. + */ +static inline void blkcg_cgwb_get(struct blkcg *blkcg) +{ + refcount_inc(&blkcg->cgwb_refcnt); +} + +/** + * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list + * @blkcg: blkcg of interest + * + * This is used to track the number of active wb's related to a blkcg. + * When this count goes to zero, all active wb has finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + * This work may occur in cgwb_release_workfn() on the cgwb_release + * workqueue. + */ +static inline void blkcg_cgwb_put(struct blkcg *blkcg) +{ + if (refcount_dec_and_test(&blkcg->cgwb_refcnt)) + blkcg_destroy_blkgs(blkcg); +} + +#else + +static inline void blkcg_cgwb_get(struct blkcg *blkcg) { } + +static inline void blkcg_cgwb_put(struct blkcg *blkcg) +{ + /* wb isn't being accounted, so trigger destruction right away */ + blkcg_destroy_blkgs(blkcg); +} + +#endif + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest -- cgit v1.2.3 From c43c5e9f524ec914e7e202f9c5ab91779629ccc6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 3 Sep 2018 10:15:33 +0200 Subject: timekeeping: Fix declaration of read_persistent_wall_and_boot_offset() It is read_persistent_wall_and_boot_offset() and not read_persistent_clock_and_boot_offset() Fixes: 3eca993740b8eb40f51 ("timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()") Signed-off-by: Christian Borntraeger Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Link: https://lkml.kernel.org/r/20180903081533.34366-1-borntraeger@de.ibm.com --- include/linux/timekeeping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 5d738804e3d6..a5a3cfc3c2fa 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -258,8 +258,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); -void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock, - struct timespec64 *boot_offset); +void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, + struct timespec64 *boot_offset); extern int update_persistent_clock64(struct timespec64 now); /* -- cgit v1.2.3 From 9fd0e09a4e86499639653243edfcb417a05c5c46 Mon Sep 17 00:00:00 2001 From: Anthony Wong Date: Fri, 31 Aug 2018 20:06:42 +0800 Subject: r8169: add support for NCube 8168 network card This card identifies itself as: Ethernet controller [0200]: NCube Device [10ff:8168] (rev 06) Subsystem: TP-LINK Technologies Co., Ltd. Device [7470:3468] Adding a new entry to rtl8169_pci_tbl makes the card work. Link: http://launchpad.net/bugs/1788730 Signed-off-by: Anthony Wong Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 99d366cb0e9f..d157983b84cf 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3084,4 +3084,6 @@ #define PCI_VENDOR_ID_OCZ 0x1b85 +#define PCI_VENDOR_ID_NCUBE 0x10ff + #endif /* _LINUX_PCI_IDS_H */ -- cgit v1.2.3 From 865e63b04e9b2a658d7f26bd13a71dcd964a9118 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Sep 2018 16:26:11 -0400 Subject: tracing: Add back in rcu_irq_enter/exit_irqson() for rcuidle tracepoints Borislav reported the following splat: ============================= WARNING: suspicious RCU usage 4.19.0-rc1+ #1 Not tainted ----------------------------- ./include/linux/rcupdate.h:631 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 1 RCU used illegally from extended quiescent state! 1 lock held by swapper/0/0: #0: 000000004557ee0e (rcu_read_lock){....}, at: perf_event_output_forward+0x0/0x130 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0-rc1+ #1 Hardware name: LENOVO 2320CTO/2320CTO, BIOS G2ET86WW (2.06 ) 11/13/2012 Call Trace: dump_stack+0x85/0xcb perf_event_output_forward+0xf6/0x130 __perf_event_overflow+0x52/0xe0 perf_swevent_overflow+0x91/0xb0 perf_tp_event+0x11a/0x350 ? find_held_lock+0x2d/0x90 ? __lock_acquire+0x2ce/0x1350 ? __lock_acquire+0x2ce/0x1350 ? retint_kernel+0x2d/0x2d ? find_held_lock+0x2d/0x90 ? tick_nohz_get_sleep_length+0x83/0xb0 ? perf_trace_cpu+0xbb/0xd0 ? perf_trace_buf_alloc+0x5a/0xa0 perf_trace_cpu+0xbb/0xd0 cpuidle_enter_state+0x185/0x340 do_idle+0x1eb/0x260 cpu_startup_entry+0x5f/0x70 start_kernel+0x49b/0x4a6 secondary_startup_64+0xa4/0xb0 This is due to the tracepoints moving to SRCU usage which does not require RCU to be "watching". But perf uses these tracepoints with RCU and expects it to be. Hence, we still need to add in the rcu_irq_enter/exit_irqson() calls for "rcuidle" tracepoints. This is a temporary fix until we have SRCU working in NMI context, and then perf can be converted to use that instead of normal RCU. Link: http://lkml.kernel.org/r/20180904162611.6a120068@gandalf.local.home Cc: x86-ml Cc: Peter Zijlstra Reported-by: Borislav Petkov Tested-by: Borislav Petkov Reviewed-by: "Paul E. McKenney" Fixes: e6753f23d961d ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7f2e16e76ac4..041f7e56a289 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,8 +158,10 @@ extern void syscall_unregfunc(void); * For rcuidle callers, use srcu since sched-rcu \ * doesn't work from the idle path. \ */ \ - if (rcuidle) \ + if (rcuidle) { \ idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + rcu_irq_enter_irqson(); \ + } \ \ it_func_ptr = rcu_dereference_raw((tp)->funcs); \ \ @@ -171,8 +173,10 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ \ - if (rcuidle) \ + if (rcuidle) { \ + rcu_irq_exit_irqson(); \ srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + } \ \ preempt_enable_notrace(); \ } while (0) -- cgit v1.2.3