From a7b8829d242b1a58107e9c02b09e93aec446d55c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 5 Jul 2017 20:30:01 +0200 Subject: iio: accel: st_accel: add SPI-3wire support Add SPI Serial Interface Mode (SIM) register information in the st_sensor_settings look up table to support devices (like the LSM303AGR accel sensor) that allow just SPI-3wire communication mode. SIM mode has to be configured before any other operation since it is not enabled by default and the driver is not able to read without that configuration. Whilst a fairly substantial patch, the actual logic is simple and it is better to have the generic fix than a band-aid. Fixes: ddc05fa28606 (iio: st-accel: add support for lsm303agr accel) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 7 +++++++ include/linux/platform_data/st_sensors_pdata.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 497f2b3a5a62..97f1b465d04f 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -105,6 +105,11 @@ struct st_sensor_fullscale { struct st_sensor_fullscale_avl fs_avl[ST_SENSORS_FULLSCALE_AVL_MAX]; }; +struct st_sensor_sim { + u8 addr; + u8 value; +}; + /** * struct st_sensor_bdu - ST sensor device block data update * @addr: address of the register. @@ -197,6 +202,7 @@ struct st_sensor_transfer_function { * @bdu: Block data update register. * @das: Data Alignment Selection register. * @drdy_irq: Data ready register of the sensor. + * @sim: SPI serial interface mode register of the sensor. * @multi_read_bit: Use or not particular bit for [I2C/SPI] multi-read. * @bootime: samples to discard when sensor passing from power-down to power-up. */ @@ -213,6 +219,7 @@ struct st_sensor_settings { struct st_sensor_bdu bdu; struct st_sensor_das das; struct st_sensor_data_ready_irq drdy_irq; + struct st_sensor_sim sim; bool multi_read_bit; unsigned int bootime; }; diff --git a/include/linux/platform_data/st_sensors_pdata.h b/include/linux/platform_data/st_sensors_pdata.h index 79b0e4cdb814..f8274b0c6888 100644 --- a/include/linux/platform_data/st_sensors_pdata.h +++ b/include/linux/platform_data/st_sensors_pdata.h @@ -17,10 +17,12 @@ * Available only for accelerometer and pressure sensors. * Accelerometer DRDY on LSM330 available only on pin 1 (see datasheet). * @open_drain: set the interrupt line to be open drain if possible. + * @spi_3wire: enable spi-3wire mode. */ struct st_sensors_platform_data { u8 drdy_int_pin; bool open_drain; + bool spi_3wire; }; #endif /* ST_SENSORS_PDATA_H */ -- cgit v1.2.3 From 7cfdfdc82a467c78af9132cb9c98e84415df34bc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 10 Jul 2017 14:45:20 +0900 Subject: libata: Cleanup ata_read_log_page() The warning message "READ LOG DMA EXT failed, trying unqueued" in ata_read_log_page() as well as the macro name ATA_HORKAGE_NO_NCQ_LOG are confusing: the command READ LOG DMA EXT is not a queued NCQ command unless it is encapsulated in a RECEIVE FPDMA QUEUED command. From ACS-4 READ LOG DMA EXT description: "The device processes the READ LOG DMA EXT command in the NCQ feature set environment (see 4.13.6) if the READ LOG DMA EXT command is encapsulated in a RECEIVE FPDMA QUEUED command (see 7.30) with the inputs encapsulated as shown in 7.23.6." 
To avoid confusion, fix the warning message to mention switching to PIO and not "unqueued" and rename the macro ATA_HORKAGE_NO_NCQ_LOG to ATA_HORKAGE_NO_DMA_LOG. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Tejun Heo --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 55de3da58b1c..931c32f1f18d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -435,7 +435,7 @@ enum { ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ - ATA_HORKAGE_NO_NCQ_LOG = (1 << 23), /* don't use NCQ for log read */ + ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ -- cgit v1.2.3 From 36acbd9e8377c27570b887e2332a5e1f0b140e16 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Fri, 14 Jul 2017 18:16:44 +0530 Subject: mmc: host: omap_hsmmc: remove unused platform callbacks Remove unused callbacks in the omap_hsmmc_platform_data structure. Signed-off-by: Faiz Abbas Acked-by: Tony Lindgren Signed-off-by: Ulf Hansson --- include/linux/platform_data/hsmmc-omap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/hsmmc-omap.h b/include/linux/platform_data/hsmmc-omap.h index 8e981be2e2c2..0ff1e0dba720 100644 --- a/include/linux/platform_data/hsmmc-omap.h +++ b/include/linux/platform_data/hsmmc-omap.h @@ -55,9 +55,6 @@ struct omap_hsmmc_platform_data { u32 caps; /* Used for the MMC driver on 2430 and later */ u32 pm_caps; /* PM capabilities of the mmc */ - /* use the internal clock */ - unsigned internal_clock:1; - /* nonremovable e.g. eMMC */ unsigned nonremovable:1; @@ -73,13 +70,6 @@ struct omap_hsmmc_platform_data { int gpio_cd; /* gpio (card detect) */ int gpio_cod; /* gpio (cover detect) */ int gpio_wp; /* gpio (write protect) */ - - int (*set_power)(struct device *dev, int power_on, int vdd); - void (*remux)(struct device *dev, int power_on); - /* Call back before enabling / disabling regulators */ - void (*before_set_reg)(struct device *dev, int power_on, int vdd); - /* Call back after enabling / disabling regulators */ - void (*after_set_reg)(struct device *dev, int power_on, int vdd); /* if we have special card, init it using this callback */ void (*init_card)(struct mmc_card *card); -- cgit v1.2.3 From 43fc509c3efb5c973991ee24c449ab2a0d71dd1e Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Thu, 20 Jul 2017 11:19:58 +0100 Subject: dma-coherent: introduce interface for default DMA pool Christoph noticed [1] that the default DMA pool in its current form overloads the DMA coherent infrastructure. In reply, Robin suggested [2] to split the per-device vs. global pool interfaces, so allocation/release from the default DMA pool is driven by the dma ops implementation. This patch implements Robin's idea and provides an interface to allocate/release/mmap the default (aka global) DMA pool. To make it clear that the existing *_from_coherent routines work on the per-device pool, rename them to *_from_dev_coherent. 
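To make the split concrete, here is a minimal sketch (the function name and fallback policy are illustrative assumptions, not taken from the patch; only the helper signatures come from the diff below) of how a dma_map_ops ->alloc() implementation could drive both pools:

/*
 * Sketch only: try the per-device coherent pool first, then fall back
 * to the default (global) pool. A real ->alloc() hook would fall
 * through to its normal page allocator when both pools miss.
 */
static void *sketch_dma_alloc(struct device *dev, size_t size,
			      dma_addr_t *dma_handle, gfp_t gfp,
			      unsigned long attrs)
{
	void *vaddr;

	/* Per-device pool; this was dma_alloc_from_coherent() before. */
	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &vaddr))
		return vaddr;

	/* New explicit fallback to the default/global DMA pool. */
	return dma_alloc_from_global_coherent(size, dma_handle);
}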
[1] https://lkml.org/lkml/2017/7/7/370 [2] https://lkml.org/lkml/2017/7/7/431 Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Suggested-by: Robin Murphy Tested-by: Andras Szemzo Reviewed-by: Robin Murphy Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 843ab866e0f4..03c0196a6f24 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -157,16 +157,40 @@ static inline int is_device_dma_capable(struct device *dev) * These three functions are only for dma allocator. * Don't use them in device drivers. */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret); -int dma_release_from_coherent(struct device *dev, int order, void *vaddr); +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); -int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); + +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); +int dma_release_from_global_coherent(int order, void *vaddr); +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, + size_t size, int *ret); + #else -#define dma_alloc_from_coherent(dev, size, handle, ret) (0) -#define dma_release_from_coherent(dev, order, vaddr) (0) -#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) +#define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) +#define dma_release_from_dev_coherent(dev, order, vaddr) (0) +#define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) + +static inline void *dma_alloc_from_global_coherent(ssize_t size, + dma_addr_t *dma_handle) +{ + return NULL; +} + +static inline int dma_release_from_global_coherent(int order, void *vaddr) +{ + return 0; +} + +static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, + void *cpu_addr, size_t size, + int *ret) +{ + return 0; +} #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA @@ -481,7 +505,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, BUG_ON(!ops); - if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; if (!arch_dma_alloc_attrs(&dev, &flag)) @@ -503,7 +527,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size, BUG_ON(!ops); WARN_ON(irqs_disabled()); - if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) + if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; if (!ops->free || !cpu_addr) -- cgit v1.2.3 From 832e4c83abc5ec25af77db6c8a0f36d78f1cf825 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 May 2017 09:16:24 +0200 Subject: uuid: remove uuid_be Everything uses uuid_t now. 
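For out-of-tree code still using the removed names, the conversion is mechanical. A hedged sketch of the old-to-new mapping (function and variable names here are illustrative):

#include <linux/errno.h>
#include <linux/uuid.h>

/* Sketch: each removed uuid_be helper has a direct uuid_t equivalent. */
static int uuid_be_conversion_sketch(void)
{
	uuid_t u;		/* was: uuid_be u; */

	uuid_gen(&u);		/* was: uuid_be_gen(&u); */

	/* was: uuid_be_to_bin(str, &u) */
	if (uuid_parse("f81d4fae-7dec-11d0-a765-00a0c91e6bf6", &u))
		return -EINVAL;

	/* was: uuid_be_cmp(u, NULL_UUID_BE) == 0 */
	return uuid_is_null(&u) ? -EINVAL : 0;
}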
Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Andy Shevchenko --- include/linux/uuid.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uuid.h b/include/linux/uuid.h index 2251e1925ea4..33b0bdbb613c 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -84,26 +84,12 @@ int guid_parse(const char *uuid, guid_t *u); int uuid_parse(const char *uuid, uuid_t *u); /* backwards compatibility, don't use in new code */ -typedef uuid_t uuid_be; -#define UUID_BE(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ - UUID_INIT(a, _b, c, d0, d1, d2, d3, d4, d5, d6, d7) -#define NULL_UUID_BE \ - UUID_BE(0x00000000, 0x0000, 0x0000, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00) - #define uuid_le_gen(u) guid_gen(u) -#define uuid_be_gen(u) uuid_gen(u) #define uuid_le_to_bin(guid, u) guid_parse(guid, u) -#define uuid_be_to_bin(uuid, u) uuid_parse(uuid, u) static inline int uuid_le_cmp(const guid_t u1, const guid_t u2) { return memcmp(&u1, &u2, sizeof(guid_t)); } -static inline int uuid_be_cmp(const uuid_t u1, const uuid_t u2) -{ - return memcmp(&u1, &u2, sizeof(uuid_t)); -} - #endif -- cgit v1.2.3 From 6c423f5751b9f68bfe7c7545519d4c7159f93e1b Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 24 Jul 2017 13:58:00 -0600 Subject: sched/wait: Clean up some documentation warnings A couple of kerneldoc comments in <linux/wait.h> had incorrect names for macro parameters, with this unsightly result: ./include/linux/wait.h:555: warning: No description found for parameter 'wq' ./include/linux/wait.h:555: warning: Excess function parameter 'wq_head' description in 'wait_event_interruptible_hrtimeout' ./include/linux/wait.h:759: warning: No description found for parameter 'wq_head' ./include/linux/wait.h:759: warning: Excess function parameter 'wq' description in 'wait_event_killable' Correct the comments and kill the warnings. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20170724135800.769c4042@lwn.net Signed-off-by: Ingo Molnar --- include/linux/wait.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index b289c96151ee..5b74e36c0ca8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -529,13 +529,13 @@ do { \ /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses - * @wq_head: the waitqueue to wait on + * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq_head is woken up. + * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. @@ -735,12 +735,12 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); /** * wait_event_killable - sleep until a condition gets true - * @wq: the waitqueue to wait on + * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. - * The @condition is checked each time the waitqueue @wq is woken up. 
+ * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. -- cgit v1.2.3 From 2fd4167fadd1360ab015e4f0e88e51843e49556c Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 12 Jul 2017 10:58:19 -0600 Subject: nvme: fabrics commands should use the fctype field for data direction Fabrics commands with opcode 0x7F use the fctype field to indicate data direction. Signed-off-by: Jon Derrick Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Fixes: eb793e2c ("nvme.h: add NVMe over Fabrics definitions") --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc74da018bdc..25d8225dbd04 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1006,7 +1006,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd) * Why can't we simply have a Fabrics In and Fabrics out command? */ if (unlikely(cmd->common.opcode == nvme_fabrics_command)) - return cmd->fabrics.opcode & 1; + return cmd->fabrics.fctype & 1; return cmd->common.opcode & 1; } -- cgit v1.2.3 From 9c5358e15ca12ed3dc3b1e51671dee5d155de8e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 17 Jul 2017 13:59:39 -0700 Subject: nvme-fc: revise TRADDR parsing The FC-NVME spec hasn't locked down on the format string for TRADDR. Currently the spec is lobbying for "nn-<16hexdigits>:pn-<16hexdigits>" where the wwn's are hex values but not prefixed by 0x. Most implementations so far expect a string format of "nn-0x<16hexdigits>:pn-0x<16hexdigits>" to be used. The transport uses the match_u64 parser which requires a leading 0x prefix to set the base properly. If it's not there, a match will either fail or return a base 10 value. The resolution in T11 is pushing out. Therefore, to fix things now and to cover any eventuality and any implementations already in the field, this patch adds support for both formats. The change consists of replacing the token matching routine with a routine that validates the fixed string format, and then builds a local copy of the hex name with a 0x prefix before calling the system parser. Note: the same parser routine exists in both the initiator and target transports. Given this is about the only "shared" item, we chose to replicate rather than create an interdependency on some shared code. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 21c37e39e41a..36cca93a5ff2 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -334,5 +334,24 @@ struct fcnvme_ls_disconnect_acc { #define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ #define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ +/* + * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>" + * the string is allowed to be specified with or without a "0x" prefix + * infront of the <16hexdigits>. Without is considered the "min" string + * and with is considered the "max" string. The hexdigits may be upper + * or lower case. 
+ */ #define NVME_FC_TRADDR_NNLEN 3 /* "?n-" */ #define NVME_FC_TRADDR_OXNNLEN 5 /* "?n-0x" */ #define NVME_FC_TRADDR_HEXNAMELEN 16 #define NVME_FC_TRADDR_MINLENGTH \ (2 * (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) #define NVME_FC_TRADDR_MAXLENGTH \ (2 * (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN) + 1) #define NVME_FC_TRADDR_MIN_PN_OFFSET \ (NVME_FC_TRADDR_NNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) #define NVME_FC_TRADDR_MAX_PN_OFFSET \ (NVME_FC_TRADDR_OXNNLEN + NVME_FC_TRADDR_HEXNAMELEN + 1) + #endif /* _NVME_FC_H */ -- cgit v1.2.3 From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes explicit and implicit ordered setting and overrides from attribute changes if implicit. Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- include/linux/workqueue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c102ef65cb64..db6dc9dc0482 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -323,6 +323,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ @@ -422,7 +423,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ + __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) -- cgit v1.2.3 From 2eaa38d9fcba5294182268b8d11770cf3fdc9bc9 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 25 Jul 2017 11:08:15 +0200 Subject: net: phy: Remove trailing semicolon in macro definition Commit e5a03bfd873c2 ("phy: Add an mdio_device structure") introduced a spurious trailing semicolon. Remove it. Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2a9567bb8186..0bb5b212ab42 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -830,7 +830,7 @@ static inline int phy_read_status(struct phy_device *phydev) dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) 
\ - dev_dbg(&_phydev->mdio.dev, format, ##args); + dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { -- cgit v1.2.3 From fdeaf7e3eb37c6dbc4b4ac97dbe1945d239eb788 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 24 Jul 2017 13:40:03 +0200 Subject: KVM: make pid available for uevents without debugfs Simplify and improve the code so that the PID is always available in the uevent even when debugfs is not available. This adds a userspace_pid field to struct kvm, as per Radim's suggestion, so that the PID can be retrieved on destruction too. Acked-by: Janosch Frank Fixes: 286de8f6ac9202 ("KVM: trigger uevents when creating or destroying a VM") Signed-off-by: Claudio Imbrenda Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 648b34cabb38..890b706d1943 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -445,6 +445,7 @@ struct kvm { struct kvm_stat_data **debugfs_stat_data; struct srcu_struct srcu; struct srcu_struct irq_srcu; + pid_t userspace_pid; }; #define kvm_err(fmt, ...) \ -- cgit v1.2.3 From bb67b496c338e15813f075f482067da930f52e39 Mon Sep 17 00:00:00 2001 From: Murilo Opsfelder Araujo Date: Tue, 18 Jul 2017 14:22:20 -0300 Subject: include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, the build fails with the following: drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release': vfio_pci.c:(.text+0xa98): undefined reference to `.vfio_spapr_pci_eeh_release' drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open': vfio_pci.c:(.text+0x1420): undefined reference to `.vfio_spapr_pci_eeh_open' In this case, vfio_pci.c should use the empty definitions of the vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions. This patch fixes it by guarding these function definitions with CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is built, which is where the non-empty versions of these functions are. We need to make use of the IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate option. This issue was found during a randconfig build. 
Logs are here: http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/ Signed-off-by: Murilo Opsfelder Araujo Reviewed-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 586809abb273..a47b985341d1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -152,7 +152,7 @@ extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, size_t *data_size); struct pci_dev; -#ifdef CONFIG_EEH +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, @@ -173,7 +173,7 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, { return -ENOTTY; } -#endif /* CONFIG_EEH */ +#endif /* CONFIG_VFIO_SPAPR_EEH */ /* * IRQfd - generic -- cgit v1.2.3 From 273752c9ff03eb83856601b2a3458218bb949e46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 26 Jul 2017 09:35:09 -0400 Subject: dm, dax: Make sure dm_dax_flush() is called if device supports it Currently dm_dax_flush() is not being called, even if the underlying dax device supports write cache, because DAXDEV_WRITE_CACHE is not being propagated up to the DM dax device. If the underlying dax device supports write cache, set DAXDEV_WRITE_CACHE on the DM dax device. This will cause dm_dax_flush() to be called. Fixes: abebfbe2f7 ("dm: add ->flush() dax operation support") Signed-off-by: Vivek Goyal Acked-by: Dan Williams Signed-off-by: Mike Snitzer --- include/linux/dax.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 794811875732..df97b7af7e2c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -87,6 +87,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t size); void dax_write_cache(struct dax_device *dax_dev, bool wc); +bool dax_write_cache_enabled(struct dax_device *dax_dev); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3 From 1937f8a29f4a650bc27e0311b43b53509a34fd22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:59 +0200 Subject: scsi: bnx2fc: Simplify CPU hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state. 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Signed-off-by: Martin K. 
Petersen --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b56573bf440d..2e7b1731ad12 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2FC_DEAD, CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, -- cgit v1.2.3 From f9f22a86912f9d36b50e9b3b383fabfb9f22dd46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:53:00 +0200 Subject: scsi: bnx2i: Simplify cpu hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state. 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Acked-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2e7b1731ad12..82b30e638430 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, -- cgit v1.2.3 From a3287c41ff405025bc57b165a0f6cd698bbbc1be Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 25 Jul 2017 16:30:34 +0100 Subject: drivers/perf: arm_pmu: Request PMU SPIs with IRQF_PER_CPU Since the PMU register interface is banked per CPU, CPU PMU interrupts cannot be handled by a CPU other than the one with the PMU asserting the interrupt. This means that migrating PMU SPIs, as we do during a CPU hotplug operation, doesn't make any sense and can lead to the IRQ being disabled entirely if we route a spurious IRQ to the new affinity target. This has been observed in practice on AMD Seattle, where CPUs on the non-boot cluster appear to take a spurious PMU IRQ when coming online, which is routed to CPU0 where it cannot be handled. This patch passes IRQF_PERCPU for PMU SPIs and forcefully sets their affinity prior to requesting them, ensuring that they cannot be migrated during hotplug events. This interacts badly with the DB8500 erratum workaround that ping-pongs the interrupt affinity from the handler, so we avoid passing IRQF_PERCPU in that case by allowing the IRQ flags to be overridden in the platdata. Fixes: 3cf7ee98b848 ("drivers/perf: arm_pmu: move irq request/free into probe") Cc: Mark Rutland Cc: Linus Walleij Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 1360dd6d5e61..af0f44effd44 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,14 @@ * interrupt and passed the address of the low level handler, * and can be used to implement any platform specific handling * before or after calling it. + * + * @irq_flags: if non-zero, these flags will be passed to request_irq + * when requesting interrupts for this PMU device. 
*/ struct arm_pmu_platdata { irqreturn_t (*handle_irq)(int irq, void *dev, irq_handler_t pmu_handler); + unsigned long irq_flags; }; #ifdef CONFIG_ARM_PMU -- cgit v1.2.3 From 8397913303abc9333f376a518a8368fa22ca5e6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jul 2017 12:21:11 +0200 Subject: genirq/cpuhotplug: Revert "Set force affinity flag on hotplug migration" That commit was part of the changes moving x86 to the generic CPU hotplug interrupt migration code. The force flag was required on x86 before the hierarchical irqdomain rework, but invoking set_affinity() with force=true stayed and had no side effects. At some point in the past, the force flag got repurposed to support the exynos timer interrupt affinity setting to a not yet online CPU, so the interrupt controller callback does not verify the supplied affinity mask against cpu_online_mask. Setting the flag in the CPU hotplug code causes the cpu online masking to be blocked on these irq controllers and results in potentially affining an interrupt to the CPU which is unplugged, i.e. instead of moving it away, it's just reassigned to it. As the force flag is no longer needed on x86, it's safe to revert that patch so the ARM irqchips which use the force flag work again. Add comments to that effect, so this won't happen again. Note: The online mask handling should be done in the generic code and the force flag and the masking in the irq chips removed altogether, but that's not a change possible for 4.13. Fixes: 77f85e66aa8b ("genirq/cpuhotplug: Set force affinity flag on hotplug migration") Reported-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Cc: Marc Zyngier Cc: Russell King Cc: LAK Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707271217590.3109@nanos Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 00db35b61e9e..d2d543794093 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -388,7 +388,12 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt - * @irq_set_affinity: set the CPU affinity on SMP machines + * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force + * argument is true, it tells the driver to + * unconditionally apply the affinity setting. Sanity + * checks against the supplied affinity mask are not + * required. This is used for CPU hotplug where the + * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ -- cgit v1.2.3 From 0b794ffae7afa7c4e5accac8791c4b78e8d080ce Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 15:11:26 +0300 Subject: net/mlx5: Fix mlx5_ifc_mtpps_reg_bits structure size Fix miscalculation in reserved_at_1a0 field. 
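The arithmetic behind the one-line change, assuming the MTPPS register is 0x1e0 bits in total (an inference from the field-select patch below, which fills the same tail with 0x20 + 0x20 bits of fields): the defined fields end at bit 0x1a0, so a 0x40-bit pad closes the register at 0x1e0 bits (60 bytes), while the old 0x60-bit pad overstated it at 0x200 bits. A sketch of a layout sanity check under that assumption:

#include <linux/bug.h>

/* Sketch: sizes assumed as described above, not taken from the patch. */
#define MTPPS_FIELDS_END_BITS	0x1a0	/* fields defined before the pad */
#define MTPPS_REG_SIZE_BITS	0x1e0	/* assumed total register size   */

static inline void mtpps_layout_check(void)
{
	BUILD_BUG_ON(MTPPS_FIELDS_END_BITS + 0x40 != MTPPS_REG_SIZE_BITS);
}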
Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 87869c04849a..fd98aef4545c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8175,7 +8175,7 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x40]; }; struct mlx5_ifc_mtppse_reg_bits { -- cgit v1.2.3 From fa3676885e3b5be1edfa1b2cc775e20a45b34a19 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 16:09:34 +0300 Subject: net/mlx5e: Add field select to MTPPS register In order to mark relevant fields while setting the MTPPS register, add field select. Otherwise it can cause a misconfiguration in firmware. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd98aef4545c..3030121b4746 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7749,8 +7749,10 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7f]; + u8 reserved_at_0[0x7d]; + u8 mtpps_enh_out_per_adj[0x1]; + u8 mtpps_fs[0x1]; u8 pcie_performance_group[0x1]; }; @@ -8159,7 +8161,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_78[0x4]; u8 cap_pin_4_mode[0x4]; - u8 reserved_at_80[0x80]; + u8 field_select[0x20]; + u8 reserved_at_a0[0x60]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -8174,8 +8177,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; + u8 enhanced_out_periodic_adjustment[0x20]; - u8 reserved_at_1a0[0x40]; + u8 reserved_at_1c0[0x20]; }; struct mlx5_ifc_mtppse_reg_bits { -- cgit v1.2.3 From 37ef38f3f83891a2f413fb872bae7d0f9bb95b27 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Thu, 27 Jul 2017 16:15:52 -0500 Subject: tty: pl011: fix initialization order of QDF2400 E44 The work-around for Qualcomm Technologies QDF2400 Erratum 44 hinges on a global variable defined in the pl011 driver. The ACPI SPCR parsing code determines whether the work-around is needed, and if so, it changes the console name from "pl011" to "qdf2400_e44". The expectation is that the pl011 driver will implement the work-around when it sees the console name. The global variable qdf2400_e44_present is set when that happens. The problem is that the work-around needs to be enabled when the pl011 driver probes, not when the console name is queried. However, sbsa_probe() is called before pl011_console_match(). The work-around appeared to work previously because the default console on QDF2400 platforms was always ttyAMA1. The first time sbsa_probe() is called (for ttyAMA0), qdf2400_e44_present is still false. Then pl011_console_match() is called, and it sets qdf2400_e44_present to true. All subsequent calls to sbsa_probe() enable the work-around. The solution is to move the global variable into spcr.c and let the pl011 driver query it during probe time. This works because all QDF2400 platforms require SPCR, so parse_spcr() will always be called. 
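The resulting flow, as a sketch (the probe-side check is paraphrased, not copied from the driver; only the extern declaration comes from the diff below): parse_spcr() sets the flag before any uart probe runs, and the probe path consults it directly.

#include <linux/acpi.h>

/* Sketch: probe-time query of the SPCR-owned erratum flag. */
static bool sbsa_needs_e44_workaround(void)
{
	/*
	 * qdf2400_e44_present is set by the ACPI SPCR parser, so every
	 * port sees the right answer at probe time, not just the port
	 * that matched the "qdf2400_e44" console name.
	 */
	return qdf2400_e44_present;
}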
pl011_console_match still checks for the "qdf2400_e44" console name, but it doesn't do anything else special. Fixes: 5a0722b898f8 ("tty: pl011: use "qdf2400_e44" as the earlycon name for QDF2400 E44") Tested-by: Jeffrey Hugo Signed-off-by: Timur Tabi Signed-off-by: Greg Kroah-Hartman --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c749eef1daa1..27b4b6615263 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1209,6 +1209,7 @@ static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE +extern bool qdf2400_e44_present; int parse_spcr(bool earlycon); #else static inline int parse_spcr(bool earlycon) { return 0; } -- cgit v1.2.3 From 99f828436788f0155798145853607ca8f0e6de93 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Jul 2017 22:29:51 +0100 Subject: dma-buf/sync_file: Allow multiple sync_files to wrap a single dma-fence Up until recently, sync_files were created to export a single dma-fence to userspace, and so we could cannibalize a bit inside dma-fence to mark whether or not we had enabled polling for the sync_file itself. However, with the advent of syncobj, we do allow userspace to create multiple sync_files for a single dma-fence. (Similarly, the sw-sync validation framework also started returning multiple sync-files wrapping a single dma-fence for a syncpt, also triggering the problem.) This patch reverts my suggestion in commit e24165537312 ("dma-buf/sync_file: only enable fence signalling on poll()") to use a single bit in the shared dma-fence and restores the sync_file->flags for tracking the bits individually. Reported-by: Gustavo Padovan Fixes: f1e8c67123cf ("dma-buf/sw-sync: Use an rbtree to sort fences in the timeline") Fixes: e9083420bbac ("drm: introduce sync objects (v4)") Signed-off-by: Chris Wilson Cc: Sumit Semwal Cc: Sean Paul Cc: Gustavo Padovan Cc: dri-devel@lists.freedesktop.org Cc: # v4.13-rc1+ Signed-off-by: Gustavo Padovan Link: http://patchwork.freedesktop.org/patch/msgid/20170728212951.7818-1-chris@chris-wilson.co.uk (cherry picked from commit db1fc97ca0c0d3fdeeadf314d99a26188438940a) --- include/linux/sync_file.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h index 5726107963b2..0ad87c434ae6 100644 --- a/include/linux/sync_file.h +++ b/include/linux/sync_file.h @@ -43,12 +43,13 @@ struct sync_file { #endif wait_queue_head_t wq; + unsigned long flags; struct dma_fence *fence; struct dma_fence_cb cb; }; -#define POLL_ENABLED DMA_FENCE_FLAG_USER_BITS +#define POLL_ENABLED 0 struct sync_file *sync_file_create(struct dma_fence *fence); struct dma_fence *sync_file_get_fence(int fd); -- cgit v1.2.3 From 9c80034921d1ece5c0e3241ba1645bce32387684 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 29 Jul 2017 14:11:43 +0200 Subject: i2c: rephrase explanation of I2C_CLASS_DEPRECATED Hopefully making clear that it is not needed for new drivers. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 00ca5b86a753..d501d3956f13 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -689,7 +689,8 @@ i2c_unlock_adapter(struct i2c_adapter *adapter) #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... 
*/ #define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ -#define I2C_CLASS_DEPRECATED (1<<8) /* Warn users that adapter will stop using classes */ +/* Warn users that the adapter doesn't support classes anymore */ +#define I2C_CLASS_DEPRECATED (1<<8) /* Internal numbers to terminate lists */ #define I2C_CLIENT_END 0xfffeU -- cgit v1.2.3 From cb891fa6a1d5f52c5f5c07b6f7f4c6d65ea55fc0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 31 Jul 2017 16:52:36 +0200 Subject: udp6: fix jumbogram reception Since commit 67a51780aebb ("ipv6: udp: leverage scratch area helpers") udp6_recvmsg() has read the skb len from the scratch area, to avoid a cache miss. But the UDP6 rx path supports RFC 2675 UDPv6 jumbograms, and their length exceeds the 16 bits available in the scratch area. As a side effect, the length returned by recvmsg() is the actual length % (1<<16). This commit addresses the issue by allocating one more bit in the IP6CB flags field and setting it for incoming jumbograms. That field is still in the first cacheline, so at recvmsg() time we can check it and fall back to accessing skb->len if required, without a measurable overhead. Fixes: 67a51780aebb ("ipv6: udp: leverage scratch area helpers") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/ipv6.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e1b442996f81..474d6bbc158c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -128,6 +128,7 @@ struct inet6_skb_parm { #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 +#define IP6SKB_JUMBOGRAM 128 }; #if defined(CONFIG_NET_L3_MASTER_DEV) @@ -152,6 +153,11 @@ static inline int inet6_iif(const struct sk_buff *skb) return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +static inline bool inet6_is_jumbogram(const struct sk_buff *skb) +{ + return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); +} + /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { -- cgit v1.2.3 From e17e8969f5c59a10083af5e260bdad6026872203 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 24 Jul 2017 16:43:49 +0200 Subject: libceph: fallback for when there isn't a pool-specific choose_arg There is now a fallback to a choose_arg index of -1 if there isn't a pool-specific choose_arg set. If you create a per-pool weight-set, that works for that pool. Otherwise we try the compat/default one. If that doesn't exist either, then we use the normal CRUSH weights. 
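As a sketch, the lookup order reads like this (the helper name and signature are assumptions declared below for illustration; the kernel side keeps choose_args in an rbtree keyed by the now-signed index):

/* Assumed helper: rbtree lookup of a choose_arg_map by index. */
struct crush_choose_arg_map *lookup_choose_arg_map(struct crush_map *map,
						   s64 index);

/* Sketch: per-pool weight-set, then compat/default, then plain CRUSH. */
static struct crush_choose_arg_map *
choose_args_for_pool_sketch(struct crush_map *map, s64 pool_id)
{
	struct crush_choose_arg_map *arg_map;

	arg_map = lookup_choose_arg_map(map, pool_id);	/* per-pool set */
	if (!arg_map)
		arg_map = lookup_choose_arg_map(map, -1); /* compat/default */

	return arg_map;	/* NULL means: use the normal CRUSH weights */
}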
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 92e165d417a6..07eed95e10c7 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -193,7 +193,7 @@ struct crush_choose_arg { struct crush_choose_arg_map { #ifdef __KERNEL__ struct rb_node node; - u64 choose_args_index; + s64 choose_args_index; #endif struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */ -- cgit v1.2.3 From ae78dd8139ce93a528beb7f3914531b7a7be9e30 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 27 Jul 2017 17:59:14 +0200 Subject: libceph: make RECOVERY_DELETES feature create a new interval This is needed so that the OSDs can regenerate the missing set at the start of a new interval where support for recovery deletes changed. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/osdmap.h | 2 ++ include/linux/ceph/rados.h | 4 ++++ 3 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d96a5f46fd..adf670ecaf94 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -148,6 +148,7 @@ struct ceph_osd_request_target { int size; int min_size; bool sort_bitwise; + bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ bool paused; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index a0996cb9faed..af3444a5bfdd 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid); bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 385db08bb8b2..b8281feda9c7 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ /* * The error code to return when an OSD can't handle a write -- cgit v1.2.3 From fd40559c8657418385e42f797e0b04bfc0add748 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 16:02:47 -0400 Subject: NFSv4: Fix EXCHANGE_ID corrupt verifier issue The verifier is allocated on the stack, but the EXCHANGE_ID RPC call was changed to be asynchronous by commit 8d89bd70bc939. If we interrupt the call to rpc_wait_for_completion_task(), we can therefore end up transmitting random stack contents in lieu of the verifier. 
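The underlying bug pattern is a general one: an asynchronous call keeps a pointer to a caller's stack object past the caller's lifetime. A compressed sketch of the hazard (all names here are illustrative, not from the NFS code):

struct xid_args {
	const unsigned char *verifier;	/* points into a stack frame */
};

/* Assumed helpers, declared only to keep the sketch self-contained. */
void send_bytes(const unsigned char *buf, int len);
void queue_async_call(void (*fn)(struct xid_args *), struct xid_args *args);

/* Runs later, possibly after setup_exchange_id() has returned. */
static void transmit_later(struct xid_args *args)
{
	/* The pointed-to stack slot may have been reused by now, so
	 * random bytes end up on the wire. */
	send_bytes(args->verifier, 8);
}

static void setup_exchange_id(struct xid_args *args)
{
	unsigned char verifier[8] = { 0 };	/* lives in this frame only */

	args->verifier = verifier;	/* BUG: pointer escapes the frame */
	queue_async_call(transmit_later, args);
}	/* frame dies here; the fix stores the verifier by value in args */

This is exactly why the diff below turns the nfs4_verifier pointer into an embedded nfs4_verifier member.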
Fixes: 8d89bd70bc939 ("NFS setup async exchange_id") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ca3bcc4ed4e5..62cbcb842f99 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1235,7 +1235,7 @@ struct nfs41_state_protection { struct nfs41_exchange_id_args { struct nfs_client *client; - nfs4_verifier *verifier; + nfs4_verifier verifier; u32 flags; struct nfs41_state_protection state_protect; }; -- cgit v1.2.3 From d9535cb7b7603aeb549c697ecdf92024e4d0a650 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:02 -0500 Subject: ptp: introduce ptp auxiliary worker Many PTP drivers need to perform some asynchronous or periodic work, like periodically handling PHC counter overflow or handling delayed timestamps for RX/TX network packets. In most of the cases, such work is implemented using workqueues. Unfortunately, kernel workqueues might introduce significant delay in work scheduling under high system load and on -RT, which could cause misbehavior of PTP drivers due to internal counter overflow, for example, and there is no way to tune their execution policy and priority manually. Hence, a kthread_worker can be used instead of workqueues, as it creates a separate named kthread for each worker, and its execution policy and priority can be configured using the chrt tool. This problem was reported for two drivers, TI CPSW CPTS and dp83640, so instead of modifying each of these drivers it was proposed to add a PTP auxiliary worker to the PHC subsystem. The patch adds a PTP auxiliary worker in the PHC subsystem using kthread_worker and kthread_delayed_work and introduces two new PHC subsystem APIs: - long (*do_aux_work)(struct ptp_clock_info *ptp) callback in the ptp_clock_info structure, which a driver should assign if it requires asynchronous or periodic work. The driver should return the delay until the next auxiliary work scheduling time (>=0) or a negative value in case further scheduling is not required. - int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay), which allows scheduling PTP auxiliary work. The name of the kthread_worker thread corresponds to the PTP PHC device name "ptp%d". Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index a026bfd089db..51349d124ee5 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -99,6 +99,11 @@ struct system_device_crosststamp; * parameter func: the desired function to use. * parameter chan: the function channel index to use. * + * @do_aux_work: Request driver to perform auxiliary (periodic) operations + * Driver should return delay of the next auxiliary work scheduling + * time (>=0) or negative value in case further scheduling + * is not required. + * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). 
* @@ -126,6 +131,7 @@ struct ptp_clock_info { struct ptp_clock_request *request, int on); int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); + long (*do_aux_work)(struct ptp_clock_info *ptp); }; struct ptp_clock; @@ -211,6 +217,16 @@ extern int ptp_clock_index(struct ptp_clock *ptp); int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan); +/** + * ptp_schedule_worker() - schedule ptp auxiliary work + * + * @ptp: The clock obtained from ptp_clock_register(). + * @delay: number of jiffies to wait before queuing + * See kthread_queue_delayed_work() for more info. + */ + +int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -225,6 +241,10 @@ static inline int ptp_clock_index(struct ptp_clock *ptp) static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } +static inline int ptp_schedule_worker(struct ptp_clock *ptp, + unsigned long delay) +{ return -EOPNOTSUPP; } + #endif #endif -- cgit v1.2.3 From a477b9cd37aa81a490dfa3265b7ff4f2c5a92463 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 20:11:02 -0500 Subject: PCI: Add pci_reset_function_locked() The implementation of PCI workarounds may require that the device is reset from its probe function. This implies that the PCI device lock is already held, and makes calling pci_reset_function() impossible (since it will itself try to take that lock). Add pci_reset_function_locked(), which is the equivalent of pci_reset_function(), except that it requires the PCI device lock to be already held by the caller. Tested-by: Ard Biesheuvel Signed-off-by: Marc Zyngier [bhelgaas: folded in fix for conflict with 52354b9d1f46 ("PCI: Remove __pci_dev_reset() and pci_dev_reset()")] Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # 4.11: 52354b9d1f46: PCI: Remove __pci_dev_reset() and pci_dev_reset() Cc: stable@vger.kernel.org # 4.11 --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..a75c13673852 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1067,6 +1067,7 @@ void pcie_flr(struct pci_dev *dev); int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); +int pci_reset_function_locked(struct pci_dev *dev); int pci_try_reset_function(struct pci_dev *dev); int pci_probe_reset_slot(struct pci_slot *slot); int pci_reset_slot(struct pci_slot *slot); -- cgit v1.2.3 From 6d29231000bbe0fb9e4893a9c68151ffdd3b5469 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 31 Jul 2017 10:31:27 +0200 Subject: mtd: nand: Declare tBERS, tR and tPROG as u64 to avoid integer overflow All timings in nand_sdr_timings are expressed in picoseconds but some of them may not fit in a u32. 
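The overflow is easy to confirm with concrete numbers: one millisecond is 10^9 picoseconds, and a u32 tops out at about 4.29 * 10^9, so any timing above roughly 4.3 ms cannot be stored. NAND erase times (tBERS) are typically whole milliseconds; the 10 ms figure below is only an illustrative assumption. A small runnable check:

#include <stdint.h>
#include <stdio.h>

/* Worked example: why tBERS_max/tR_max/tPROG_max need u64 picoseconds. */
int main(void)
{
	uint64_t tBERS_max_ps = 10ULL * 1000 * 1000 * 1000; /* e.g. 10 ms */

	printf("u32 max: %u ps (~%.2f ms)\n", UINT32_MAX, UINT32_MAX / 1e9);
	printf("tBERS:   %llu ps (%.0f ms), fits in u32: %s\n",
	       (unsigned long long)tBERS_max_ps, tBERS_max_ps / 1e9,
	       tBERS_max_ps <= UINT32_MAX ? "yes" : "no");
	return 0;
}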
Signed-off-by: Boris Brezillon Fixes: 204e7ecd47e2 ("mtd: nand: Add a few more timings to nand_sdr_timings") Reported-by: Alexander Dahl Cc: Reviewed-by: Alexander Dahl Tested-by: Alexander Dahl Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 892148c448cc..5216d2eb2289 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -681,10 +681,10 @@ struct nand_buffers { * @tWW_min: WP# transition to WE# low */ struct nand_sdr_timings { - u32 tBERS_max; + u64 tBERS_max; u32 tCCS_min; - u32 tPROG_max; - u32 tR_max; + u64 tPROG_max; + u64 tR_max; u32 tALH_min; u32 tADL_min; u32 tALS_min; -- cgit v1.2.3 From c994f778bb1cca8ebe7a4e528cefec233e93b5cc Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Tue, 1 Aug 2017 16:43:43 +0300 Subject: net/mlx4_en: Fix wrong indication of Wake-on-LAN (WoL) support Currently when WoL is supported but disabled, ethtool reports: "Supports Wake-on: d". Fix the indication of WoL support, so that the indication remains "g" all the time if the NIC supports WoL. Tested: As expected, when the NIC supports WoL, ethtool reports: Supports Wake-on: g Wake-on: d when the NIC doesn't support WoL, ethtool reports: Supports Wake-on: d Wake-on: d Fixes: 14c07b1358ed ("mlx4: Wake on LAN support") Signed-off-by: Inbar Karmy Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index aad5d81dfb44..b54517c05e9a 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -620,6 +620,7 @@ struct mlx4_caps { u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; u32 vf_caps; + bool wol_port[MLX4_MAX_PORTS + 1]; struct mlx4_rate_limit_caps rl_caps; }; -- cgit v1.2.3 From 3898da947bbaf9e7fd5816e825978d360028bba2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 2 Aug 2017 17:55:54 +0200 Subject: KVM: avoid using rcu_dereference_protected During teardown, accesses to memslots and buses are using rcu_dereference_protected with an always-true condition because these accesses are done outside the usual mutexes. This is because the last reference is gone and there cannot be any concurrent modifications, but rcu_dereference_protected is ugly and unobvious. Instead, check the refcount in kvm_get_bus and __kvm_memslots. 
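A sketch of the teardown call site this is aimed at (paraphrased from kvm_destroy_vm, not verbatim): nothing holds slots_lock or SRCU there, but users_count has already dropped to zero, so the relaxed check in the diff below still passes.

/* Sketch: bus teardown using the same accessor as locked paths. */
static void destroy_buses_sketch(struct kvm *kvm)
{
	int i;

	for (i = 0; i < KVM_NR_BUSES; i++) {
		/* users_count == 0 here, so kvm_get_bus() is legitimate
		 * even without slots_lock or kvm->srcu held. */
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		kvm_io_bus_destroy(bus);	/* frees ranges + bus */
	}
}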
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- include/linux/kvm_host.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 890b706d1943..21a6fd6c44af 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -477,7 +477,8 @@ struct kvm { static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock)); + lockdep_is_held(&kvm->slots_lock) || + !refcount_read(&kvm->users_count)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) @@ -570,7 +571,8 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock)); + lockdep_is_held(&kvm->slots_lock) || + !refcount_read(&kvm->users_count)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) -- cgit v1.2.3 From 3ea277194daaeaa84ce75180ec7c7a2075027a68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 2 Aug 2017 13:31:52 -0700 Subject: mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries Nadav Amit identified a theoretical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() The same type of race exists for reads when protecting for PROT_NONE and also exists for operations that can leave an old TLB entry behind such as munmap, mremap and madvise. For some operations like mprotect, it's not necessarily a data integrity issue but it is a correctness issue as there is a window where an mprotect that limits access still allows access. For munmap, it's potentially a data integrity issue although the race is massive as an munmap, mmap and return to userspace must all complete between the window when reclaim drops the PTL and flushes the TLB. However, it's theoretically possible so handle this issue by flushing the mm if reclaim is potentially currently batching TLB flushes. Other instances where a flush is required for a present pte should be ok as either the page lock is held preventing parallel reclaim or a page reference count is elevated preventing a parallel free leading to corruption. In the case of page_mkclean there isn't an obvious path that userspace could take advantage of without using the operations that are guarded by this patch. Other users such as gup are ok, as a race with reclaim looks just at PTEs. huge page variants should be ok as they don't race with reclaim. mincore only looks at PTEs. userfault also should be ok as if a parallel reclaim takes place, it will either fault the page back in or read some of the data before the flush occurs triggering a fault. Note that a variant of this patch was acked by Andy Lutomirski but this was for the x86 parts on top of his PCID work which didn't make the 4.13 merge window as expected. His ack is dropped from this version and there will be a follow-on patch on top of PCID that will include his ack. 
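The consumer of the new field lives in mm/rmap.c, outside this include/linux view; a simplified sketch of what flush_tlb_batched_pending() does (the actual body may differ in details such as barriers):

/*
 * Sketch: called under the page table lock by munmap/mprotect-style
 * paths before they depend on reclaim's batched flush having happened.
 */
static void flush_tlb_batched_pending_sketch(struct mm_struct *mm)
{
	if (mm->tlb_flush_batched) {
		flush_tlb_mm(mm);	/* flush the whole mm; rare path */
		/* The PTL serializes this against reclaim re-setting the
		 * flag for PTEs protected by the same lock. */
		mm->tlb_flush_batched = false;
	}
}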
[akpm@linux-foundation.org: tweak comments] [akpm@linux-foundation.org: fix spello] Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de Reported-by: Nadav Amit Signed-off-by: Mel Gorman Cc: Andy Lutomirski Cc: [v4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff151814a02d..7f384bb62d8e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -494,6 +494,10 @@ struct mm_struct { * PROT_NONE or PROT_NUMA mapped page. */ bool tlb_flush_pending; +#endif +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* See flush_tlb_batched_pending() */ + bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From d16977f3a6cfbb5e9ce477f423a1bf343347c1ed Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 2 Aug 2017 13:32:01 -0700 Subject: kthread: fix documentation build warning The kerneldoc comment for kthread_create() had an incorrect argument name, leading to a warning in the docs build. Correct it, and make one more small step toward a warning-free build. Link: http://lkml.kernel.org/r/20170724135916.7f486c6f@lwn.net Signed-off-by: Jonathan Corbet Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4fec8b775895..82e197eeac91 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * @threadfn: the function to run in the thread * @data: data pointer for @threadfn() * @namefmt: printf-style format string for the thread name - * @...: arguments for @namefmt. + * @arg...: arguments for @namefmt. * * This macro will create a kthread on the current node, leaving it in * the stopped state. This is just a helper for kthread_create_on_node(); -- cgit v1.2.3 From 89affbf5d9ebb15c6460596822e8857ea2f9e735 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Wed, 2 Aug 2017 13:32:18 -0700 Subject: cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a deadlock in the slub allocator, inside get_any_partial. The loop reads the mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mems_allowed via read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via the cpusets_enabled() static branch. This branch can be rewritten dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang.
This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to use two different static keys: one for begin (pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we first bump the pre_enable key to ensure that cpuset_mems_allowed_begin() always returns a valid seqcount if we are enabling cpusets. Similarly, when disabling cpusets via cpuset_dec(), we first ensure that callers of cpuset_mems_allowed_retry() will start ignoring the seqcount value before we let cpuset_mems_allowed_begin() return 0. The relevant stack traces of the two stuck threads: CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: smp_call_function+0x3b/0x70 on_each_cpu+0x2f/0x90 text_poke_bp+0x87/0xd0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 ? copy_process.part.40+0xf7/0x1de0 __slab_alloc.isra.80+0x54/0x90 copy_process.part.40+0xf7/0x1de0 copy_process.part.40+0xf7/0x1de0 kmem_cache_alloc_node+0x8a/0x280 copy_process.part.40+0xf7/0x1de0 _do_fork+0xe7/0x6c0 _raw_spin_unlock_irq+0x2d/0x60 trace_hardirqs_on_caller+0x136/0x1d0 entry_SYSCALL_64_fastpath+0x5/0xad do_syscall_64+0x27/0x350 SyS_clone+0x19/0x20 do_syscall_64+0x60/0x350 entry_SYSCALL64_slow_path+0x25/0x25 Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled") Signed-off-by: Dima Zavin Reported-by: Cliff Spradlin Acked-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Christopher Lameter Cc: Li Zefan Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 119a3f9604b0..898cfe2eeb42 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,6 +18,19 @@ #ifdef CONFIG_CPUSETS +/* + * Static branch rewrites can happen in an arbitrary order for a given + * key. In code paths where we need to loop with read_mems_allowed_begin() and + * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need + * to ensure that begin() always gets rewritten before retry() in the + * disabled -> enabled transition. If not, then if local irqs are disabled + * around the loop, we can deadlock since retry() would always be + * comparing the latest value of the mems_allowed seqcount against 0 as + * begin() still would see cpusets_enabled() as false.
The enabled -> disabled + * transition should happen in reverse order for the same reasons (want to stop + * looking at real value of mems_allowed.sequence in retry() first). + */ +extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { @@ -32,12 +45,14 @@ static inline int nr_cpusets(void) static inline void cpuset_inc(void) { + static_branch_inc(&cpusets_pre_enable_key); static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_pre_enable_key); } extern int cpuset_init(void); @@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void); */ static inline unsigned int read_mems_allowed_begin(void) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); @@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void) */ static inline bool read_mems_allowed_retry(unsigned int seq) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); -- cgit v1.2.3 From 1ee1c3f5b5cff959e0ac95a125bd15eaf88cc638 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 2 Aug 2017 13:32:27 -0700 Subject: mm: allow page_cache_get_speculative in interrupt context The kernel panics when calling the IRQ-safe __get_user_pages_fast in an NMI handler. The bug was introduced by commit 2947ba054a4d ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). The original x86 __get_user_page_fast used plain get_page() or page_ref_add(). However, the generic __get_user_page_fast uses page_cache_get_speculative(), which has VM_BUG_ON(in_interrupt()). There is no reason to prevent page_cache_get_speculative from being used in interrupt context. According to the author, putting a BUG_ON there is just because the code is not verifying correctness of interrupt races. I did some tests in interrupt context and found no issue. So remove VM_BUG_ON(in_interrupt()) from page_cache_get_speculative(). Link: http://lkml.kernel.org/r/1501609146-59730-1-git-send-email-kan.liang@intel.com Fixes: 2947ba054a4d ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation") Signed-off-by: Kan Liang Cc: Jens Axboe Cc: Al Viro Cc: Kirill A. Shutemov Cc: Ying Huang Cc: Nicholas Piggin Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index baa9344dcd10..79b36f57c3ba 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold); */ static inline int page_cache_get_speculative(struct page *page) { - VM_BUG_ON(in_interrupt()); - #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); -- cgit v1.2.3 From 931b3c1a832621b4bdcbaf783096fc267eb36fbe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:37 +0300 Subject: RDMA/mlx5: Fix existence check for extended address vector The extended address vector flag is the highest bit in a be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes the already-declared define.
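To make the pitfall concrete, a hypothetical illustration (variable names invented, not the driver's code): on a little-endian CPU, testing a big-endian word against a host-order mask examines bytes of the wrong significance, so the value (or the mask) must be converted first.

	__be32 wire = cpu_to_be32(0x80000000);	/* highest wire bit set */

	/* broken: treats the big-endian word as host order; on
	 * little-endian this inspects what is really the low byte */
	if ((__force u32)wire & 0x80000000)
		;	/* never taken on little-endian */

	/* correct: bring the value into CPU order before testing */
	if (be32_to_cpu(wire) & 0x80000000)
		;	/* taken on every architecture */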
Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/qp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..f378dc0e7eaf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, -- cgit v1.2.3 From 0cca6c8920ade95e2741b2062cf1397dc546fb0f Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Sun, 6 Aug 2017 16:00:05 +0200 Subject: pinctrl: generic: update references to Documentation/pinctrl.txt Update deprecated references to Documentation/pinctrl.txt since it has been moved to Documentation/driver-api/pinctl.rst. Signed-off-by: Ludovic Desroches Fixes: 5a9b73832e9e ("pinctrl.txt: move it to the driver-api book") Signed-off-by: Linus Walleij --- include/linux/device.h | 2 +- include/linux/pinctrl/pinconf-generic.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 723cd54b94da..beabdbc08420 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -843,7 +843,7 @@ struct dev_links_info { * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @pins: For device pin management. - * See Documentation/pinctrl.txt for details. + * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 231d3075815a..e91d1b6a260d 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -81,8 +81,8 @@ * it. * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/pinctrl.txt, section - * "GPIO mode pitfalls" for a discussion around this parameter.) + * indicate low level. (Please see Documentation/driver-api/pinctl.rst, + * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power * supplies, the argument to this parameter (on a custom format) tells * the driver which alternative power source to use. -- cgit v1.2.3 From 0fb228d30b8d72bfee51f57e638d412324d44a11 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 1 Aug 2017 15:12:39 -0700 Subject: nvmet_fc: add defer_req callback for deferment of cmd buffer return At queue creation, the transport allocates a local job struct (struct nvmet_fc_fcp_iod) for each possible element of the queue. When a new CMD is received from the wire, a job struct is allocated from the queue and then used for the duration of the command. The job struct contains buffer space for the wire command iu. Thus, upon allocation of the job struct, the cmd iu buffer is copied to the job struct and the LLDD may immediately free/reuse the CMD IU buffer passed in the call.
However, in some circumstances, due to the packetized nature of FC and the API of the FC LLDD, the LLDD may issue the hw command that sends the wire response yet not see the hw completion for it, and thus not upcall the nvmet_fc layer, before a new command is asynchronously received on the wire. In other words, it's possible for the initiator to get the response from the wire, thus believe a command slot is free, and send a new command iu. The new command iu may be received by the LLDD and passed to the transport before the LLDD has serviced the hw completion and made the teardown calls for the original job struct. As such, there is no job struct available for the new io. E.g. it appears like the host sent more queue elements than the queue size. It didn't, based on its understanding. Rather than treat this as a hard connection failure, queue the new request until a job struct does free up. As the buffer isn't copied (there is no job struct yet), a special return value must be returned to the LLDD to signify that it should hold off on recycling the cmd iu buffer. And later, when a job struct is allocated and the buffer copied, a new LLDD callback is introduced to notify the LLDD and allow it to recycle its command iu buffer. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 6c8c5d8041b7..2591878c1d48 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,6 +346,11 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -846,6 +851,8 @@ struct nvmet_fc_target_template { struct nvmefc_tgt_fcp_req *fcpreq); void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); + void (*defer_rcv)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); u32 max_hw_queues; u16 max_sgl_segments; -- cgit v1.2.3 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that the Linux TLB batching mechanism suffers from various races. Races caused by batching during reclamation were recently handled by Mel and this patch-set deals with others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED.
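The shape of such a proof-of-concept, as a hypothetical sketch (not the actual reproducer, which the series links later as [4]; in practice writes must be interleaved with the zaps and many iterations are needed to hit the window):

	/* build with: cc -O2 -pthread poc.c */
	#define _DEFAULT_SOURCE
	#include <assert.h>
	#include <pthread.h>
	#include <string.h>
	#include <sys/mman.h>

	#define LEN (2 * 1024 * 1024)
	static char *buf;

	static void *zap(void *arg)
	{
		madvise(buf, LEN, MADV_DONTNEED);	/* races with the other zap */
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		memset(buf, 0xaa, LEN);		/* populate pages, warm the TLB */

		pthread_create(&t1, NULL, zap, NULL);
		pthread_create(&t2, NULL, zap, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);

		/* after MADV_DONTNEED, reads must observe zero-fill; with the
		 * race, a stale TLB entry can still expose the old 0xaa data */
		assert(buf[0] == 0);
		return 0;
	}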
Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page. Finally, there is also one memory barrier missing, which may affect architectures with a weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was confirmed by adding an assertion to check that tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f384bb62d8e..f58f76ee1dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -493,7 +493,7 @@ struct mm_struct { * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ - bool tlb_flush_pending; + atomic_t tlb_flush_pending; #endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ @@ -532,33 +532,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { barrier(); - return mm->tlb_flush_pending; + return atomic_read(&mm->tlb_flush_pending) > 0; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { - mm->tlb_flush_pending = true; + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); /* - * Guarantee that the tlb_flush_pending store does not leak into the + * Guarantee that the tlb_flush_pending increase does not leak into the * critical section updating the page tables */ smp_mb__before_spinlock(); } + /* Clearing is done after a TLB flush, which also provides a barrier.
*/ -static inline void clear_tlb_flush_pending(struct mm_struct *mm) +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { barrier(); - mm->tlb_flush_pending = false; + atomic_dec(&mm->tlb_flush_pending); } #else static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { return false; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { } -static inline void clear_tlb_flush_pending(struct mm_struct *mm) + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { } #endif -- cgit v1.2.3 From 0a2c40487f3e4215c6ab46e7f837036badfb542b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:59 -0700 Subject: mm: migrate: fix barriers around tlb_flush_pending Reading tlb_flush_pending while the page-table lock is taken does not require a barrier, since the lock/unlock already acts as a barrier. The barrier in mm_tlb_flush_pending() is therefore removed. However, migrate_misplaced_transhuge_page() calls mm_tlb_flush_pending() while the page-table lock is already released, which may present a problem on architectures with a weak memory model (PPC). To deal with this case, a new parameter is added to mm_tlb_flush_pending() to indicate if it is read without the page-table lock taken, and smp_mb__after_unlock_lock() is called in that case. Link: http://lkml.kernel.org/r/20170802000818.4760-3-namit@vmware.com Signed-off-by: Nadav Amit Acked-by: Rik van Riel Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Andy Lutomirski Cc: Mel Gorman Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Russell King Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f58f76ee1dfa..0e478ebd2706 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -526,12 +526,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) /* * Memory barriers to keep this state in sync are graciously provided by * the page table locks, outside of which no page table modifications happen. - * The barriers below prevent the compiler from re-ordering the instructions - * around the memory barriers that are already present in the code. + * The barriers are used to ensure the order between tlb_flush_pending updates, + * which happen while the lock is not taken, and the PTE updates, which happen + * while the lock is taken, are serialized. */ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { - barrier(); return atomic_read(&mm->tlb_flush_pending) > 0; } @@ -554,7 +554,13 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm) /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void dec_tlb_flush_pending(struct mm_struct *mm) { - barrier(); + /* + * Guarantee that the tlb_flush_pending does not not leak into the + * critical section, since we must order the PTE change and changes to + * the pending TLB flush indication. We could have relied on TLB flush + * as a memory barrier, but this behavior is not clearly documented.
+ */ + smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } #else -- cgit v1.2.3 From 56236a59556cfd3bae7bffb7e5f438b5ef0af880 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:05 -0700 Subject: mm: refactor TLB gathering API This patch is a preparatory patch for solving race problems caused by TLB batch. For that, we will increase/decrease the TLB flush pending count of mm_struct whenever tlb_[gather|finish]_mmu is called. To keep that simple, this patch separates the architecture-specific part, renames it to arch_tlb_[gather|finish]_mmu, and has the generic part just call it. It shouldn't change any behavior. Link: http://lkml.kernel.org/r/20170802000818.4760-5-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Acked-by: Mel Gorman Cc: Ingo Molnar Cc: Russell King Cc: Tony Luck Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Jeff Dike Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e478ebd2706..c605f2a3a68e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -522,6 +522,12 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return mm->cpu_vm_mask_var; } +struct mmu_gather; +extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + unsigned long start, unsigned long end); +extern void tlb_finish_mmu(struct mmu_gather *tlb, + unsigned long start, unsigned long end); + #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * Memory barriers to keep this state in sync are graciously provided by -- cgit v1.2.3 From 0a2dd266dd6b7a31503b5bbe63af05961a6b446d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:09 -0700 Subject: mm: make tlb_flush_pending global Currently, tlb_flush_pending is used only for CONFIG_[NUMA_BALANCING|COMPACTION] but upcoming patches to solve a subtle TLB flush batching problem will use it regardless of compaction/NUMA, so this patch removes the dependency. [akpm@linux-foundation.org: remove more ifdefs from world's ugliest printk statement] Link: http://lkml.kernel.org/r/20170802000818.4760-6-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Acked-by: Mel Gorman Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c605f2a3a68e..892a7b0196fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -487,14 +487,12 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; #endif -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * An operation with batched TLB flushing is going on. Anything that * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page.
*/ atomic_t tlb_flush_pending; -#endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ bool tlb_flush_batched; @@ -528,7 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, extern void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); -#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) /* * Memory barriers to keep this state in sync are graciously provided by * the page table locks, outside of which no page table modifications happen. @@ -569,24 +566,6 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } -#else -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - return false; -} - -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ -} -#endif struct vm_fault; -- cgit v1.2.3 From 99baac21e4585f4258f919502c6e23f1e5edc98c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:12 -0700 Subject: mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem Nadav reported that parallel MADV_DONTNEED on the same range has a stale TLB problem and Mel fixed it[1] and found the same problem on MADV_FREE[2]. Quote from Mel Gorman: "The race in question is CPU 0 running madv_free and updating some PTEs while CPU 1 is also running madv_free and looking at the same PTEs. CPU 1 may have writable TLB entries for a page but fail the pte_dirty check (because CPU 0 has updated it already) and potentially fail to flush. Hence, when madv_free on CPU 1 returns, there are still potentially writable TLB entries and the underlying PTE is still present so that a subsequent write does not necessarily propagate the dirty bit to the underlying PTE any more. Reclaim at some unknown time at the future may then see that the PTE is still clean and discard the page even though a write has happened in the meantime. I think this is possible but I could have missed some protection in madv_free that prevents it happening." This patch aims for solving both problems all at once and is ready for the other problem with KSM, MADV_FREE and the soft-dirty story[3]. The TLB batch API (tlb_[gather|finish]_mmu) uses [inc|dec]_tlb_flush_pending and mm_tlb_flush_pending so that when tlb_finish_mmu is called, we can detect that parallel threads are going on. In that case, forcefully flush the TLB to prevent the user from accessing memory via a stale TLB entry even though the gather failed to collect the page table entry. I confirmed this patch works with the test program [4] Nadav gave, so this patch supersedes "mm: Always flush VMA ranges affected by zap_page_range v2" in current mmotm. NOTE: This patch modifies the arch-specific TLB gathering interface (x86, ia64, s390, sh, um). It seems most architectures are straightforward, but s390 needs care because tlb_flush_mmu works only if mm->context.flush_mm is set to non-zero, which happens only when a pte entry really is cleared by ptep_get_and_clear and friends. However, this problem never changes the pte entries, yet we need to flush to prevent memory access through a stale tlb.
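Putting the pieces together, tlb_finish_mmu() ends up roughly like the sketch below (simplified from the description above and the arch_* split of the earlier refactor; not a verbatim quote of mm/memory.c):

	void tlb_finish_mmu(struct mmu_gather *tlb,
			    unsigned long start, unsigned long end)
	{
		/*
		 * If a parallel thread batched TLB flushes on this mm while
		 * we were gathering, our gather may have skipped entries it
		 * wrongly believed clean, so force a full flush.
		 */
		bool force = mm_tlb_flush_nested(tlb->mm);

		arch_tlb_finish_mmu(tlb, start, end, force);
		dec_tlb_flush_pending(tlb->mm);
	}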
[1] http://lkml.kernel.org/r/20170725101230.5v7gvnjmcnkzzql3@techsingularity.net [2] http://lkml.kernel.org/r/20170725100722.2dxnmgypmwnrfawp@suse.de [3] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com [4] https://patchwork.kernel.org/patch/9861621/ [minchan@kernel.org: decrease tlb flush pending count in tlb_finish_mmu] Link: http://lkml.kernel.org/r/20170808080821.GA31730@bbox Link: http://lkml.kernel.org/r/20170802000818.4760-7-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Reported-by: Nadav Amit Reported-by: Mel Gorman Acked-by: Mel Gorman Cc: Ingo Molnar Cc: Russell King Cc: Tony Luck Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Jeff Dike Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 892a7b0196fd..3cadee0a3508 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -538,6 +538,14 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 0; } +/* + * Returns true if there are two above TLB batching threads in parallel. + */ +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); -- cgit v1.2.3 From 739a6d5d640a9811beef6a828253ee184dd431c5 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 11 Aug 2017 16:49:12 +0300 Subject: drm: omapdrm: Remove omapdrm platform data The omapdrm platform data are not used anymore, remove them. Signed-off-by: Laurent Pinchart Reviewed-by: Tomi Valkeinen Signed-off-by: Tomi Valkeinen --- include/linux/platform_data/omap_drm.h | 53 ---------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 include/linux/platform_data/omap_drm.h (limited to 'include/linux') diff --git a/include/linux/platform_data/omap_drm.h b/include/linux/platform_data/omap_drm.h deleted file mode 100644 index f4e4a237ebd2..000000000000 --- a/include/linux/platform_data/omap_drm.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * DRM/KMS platform data for TI OMAP platforms - * - * Copyright (C) 2012 Texas Instruments - * Author: Rob Clark - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#ifndef __PLATFORM_DATA_OMAP_DRM_H__ -#define __PLATFORM_DATA_OMAP_DRM_H__ - -/* - * Optional platform data to configure the default configuration of which - * pipes/overlays/CRTCs are used.. if this is not provided, then instead the - * first CONFIG_DRM_OMAP_NUM_CRTCS are used, and they are each connected to - * one manager, with priority given to managers that are connected to - * detected devices. 
Remaining overlays are used as video planes. This - * should be a good default behavior for most cases, but yet there still - * might be times when you wish to do something different. - */ -struct omap_kms_platform_data { - /* overlays to use as CRTCs: */ - int ovl_cnt; - const int *ovl_ids; - - /* overlays to use as video planes: */ - int pln_cnt; - const int *pln_ids; - - int mgr_cnt; - const int *mgr_ids; - - int dev_cnt; - const char **dev_names; -}; - -struct omap_drm_platform_data { - uint32_t omaprev; - struct omap_kms_platform_data *kms_pdata; -}; - -#endif /* __PLATFORM_DATA_OMAP_DRM_H__ */ -- cgit v1.2.3