From c0a13aa6da5da19f9eedb562b226ec585aabdca9 Mon Sep 17 00:00:00 2001 From: Vince Hsu Date: Mon, 13 Jul 2015 13:39:39 +0100 Subject: reset: add of_reset_control_get_by_index() Add of_reset_control_get_by_index() to allow the drivers to get reset device without knowing its name. Signed-off-by: Vince Hsu [jonathanh@nvidia.com: Updated stub function to return -ENOTSUPP instead of -ENOSYS which should only be used for system calls.] Signed-off-by: Jon Hunter Signed-off-by: Philipp Zabel --- include/linux/reset.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 7f65f9cff951..6db74ad3dec7 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -38,6 +38,9 @@ static inline struct reset_control *devm_reset_control_get_optional( struct reset_control *of_reset_control_get(struct device_node *node, const char *id); +struct reset_control *of_reset_control_get_by_index( + struct device_node *node, int index); + #else static inline int reset_control_reset(struct reset_control *rstc) @@ -106,6 +109,12 @@ static inline struct reset_control *of_reset_control_get( return ERR_PTR(-ENOSYS); } +static inline struct reset_control *of_reset_control_get_by_index( + struct device_node *node, int index) +{ + return ERR_PTR(-ENOTSUPP); +} + #endif /* CONFIG_RESET_CONTROLLER */ #endif -- cgit v1.2.3 From 39b4da71ca334354f30941067f214ea2f2b92f3e Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 29 Oct 2015 09:55:00 +0100 Subject: reset: use ENOTSUPP instead of ENOSYS ENOSYS is reserved to report invalid syscalls to userspace. Consistently return ENOTSUPP to indicate that the driver doesn't support the functionality or the reset framework is not enabled at all. Signed-off-by: Philipp Zabel --- include/linux/reset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 6db74ad3dec7..c4c097de0ba9 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -74,7 +74,7 @@ static inline void reset_control_put(struct reset_control *rstc) static inline int device_reset_optional(struct device *dev) { - return -ENOSYS; + return -ENOTSUPP; } static inline struct reset_control *__must_check reset_control_get( @@ -94,19 +94,19 @@ static inline struct reset_control *__must_check devm_reset_control_get( static inline struct reset_control *reset_control_get_optional( struct device *dev, const char *id) { - return ERR_PTR(-ENOSYS); + return ERR_PTR(-ENOTSUPP); } static inline struct reset_control *devm_reset_control_get_optional( struct device *dev, const char *id) { - return ERR_PTR(-ENOSYS); + return ERR_PTR(-ENOTSUPP); } static inline struct reset_control *of_reset_control_get( struct device_node *node, const char *id) { - return ERR_PTR(-ENOSYS); + return ERR_PTR(-ENOTSUPP); } static inline struct reset_control *of_reset_control_get_by_index( -- cgit v1.2.3 From f70be6dac6c39c939cef82e068b7e94aca96dc99 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 17 Nov 2015 15:25:23 +0800 Subject: security: remove unused cap_is_fs_cap function Since commit 3bc1fa8a ("LSM: remove BSD secure level security module") there is no user of cap_is_fs_cap any more, so remove it. Signed-off-by: Yaowei Bai Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index af9f0b9e80e6..b03200374608 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -171,12 +171,6 @@ static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set) /* Used to decide between falling back on the old suser() or fsuser(). */ -static inline int cap_is_fs_cap(int cap) -{ - const kernel_cap_t __cap_fs_set = CAP_FS_SET; - return !!(CAP_TO_MASK(cap) & __cap_fs_set.cap[CAP_TO_INDEX(cap)]); -} - static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { const kernel_cap_t __cap_fs_set = CAP_FS_SET; -- cgit v1.2.3 From e42852bf88144affc227884b62637118ba74b783 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 17 Nov 2015 15:25:24 +0800 Subject: security/capability.h: cap_issubset/isclear can be boolean This patch makes cap_issubset/isclear return bool due to these functions only using either one or zero as their return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/capability.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index b03200374608..f314275d4e3f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -145,24 +145,24 @@ static inline kernel_cap_t cap_invert(const kernel_cap_t c) return dest; } -static inline int cap_isclear(const kernel_cap_t a) +static inline bool cap_isclear(const kernel_cap_t a) { unsigned __capi; CAP_FOR_EACH_U32(__capi) { if (a.cap[__capi] != 0) - return 0; + return false; } - return 1; + return true; } /* * Check if "a" is a subset of "set". - * return 1 if ALL of the capabilities in "a" are also in "set" - * cap_issubset(0101, 1111) will return 1 - * return 0 if ANY of the capabilities in "a" are not in "set" - * cap_issubset(1111, 0101) will return 0 + * return true if ALL of the capabilities in "a" are also in "set" + * cap_issubset(0101, 1111) will return true + * return false if ANY of the capabilities in "a" are not in "set" + * cap_issubset(1111, 0101) will return false */ -static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set) +static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { kernel_cap_t dest; dest = cap_drop(a, set); -- cgit v1.2.3 From b9a1a743818ea3265abf98f9431623afa8c50c86 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Nov 2015 15:25:23 +0100 Subject: ASoC: samsung: pass DMA channels as pointers ARM64 allmodconfig produces a bunch of warnings when building the samsung ASoC code: sound/soc/samsung/dmaengine.c: In function 'samsung_asoc_init_dma_data': sound/soc/samsung/dmaengine.c:53:32: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] playback_data->filter_data = (void *)playback->channel; sound/soc/samsung/dmaengine.c:60:31: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] capture_data->filter_data = (void *)capture->channel; We could easily shut up the warning by adding an intermediate cast, but there is a bigger underlying problem: The use of IORESOURCE_DMA to pass data from platform code to device drivers is dubious to start with, as what we really want is a pointer that can be passed into a filter function. Note that on s3c64xx, the pl08x DMA data is already a pointer, but gets cast to resource_size_t so we can pass it as a resource, and it then gets converted back to a pointer. In contrast, the data we pass for s3c24xx is an index into a device specific table, and we artificially convert that into a pointer for the filter function. Signed-off-by: Arnd Bergmann Reviewed-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- include/linux/platform_data/asoc-s3c.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/asoc-s3c.h b/include/linux/platform_data/asoc-s3c.h index 5e0bc779e6c5..33f88b4479e4 100644 --- a/include/linux/platform_data/asoc-s3c.h +++ b/include/linux/platform_data/asoc-s3c.h @@ -39,6 +39,10 @@ struct samsung_i2s { */ struct s3c_audio_pdata { int (*cfg_gpio)(struct platform_device *); + void *dma_playback; + void *dma_capture; + void *dma_play_sec; + void *dma_capture_mic; union { struct samsung_i2s i2s; } type; -- cgit v1.2.3 From 58383c78425e4ee1c077253cf297b641c861c02e Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 4 Nov 2015 09:56:26 +0100 Subject: gpio: change member .dev to .parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name .dev in a struct is normally reserved for a struct device that is let us say a superclass to the thing described by the struct. struct gpio_chip stands out by confusingly using a struct device *dev to point to the parent device (such as a platform_device) that represents the hardware. As we want to give gpio_chip:s real devices, this is not working. We need to rename this member to parent. This was done by two coccinelle scripts, I guess it is possible to combine them into one, but I don't know such stuff. They look like this: @@ struct gpio_chip *var; @@ -var->dev +var->parent and: @@ struct gpio_chip var; @@ -var.dev +var.parent and: @@ struct bgpio_chip *var; @@ -var->gc.dev +var->gc.parent Plus a few instances of bgpio that I couldn't figure out how to teach Coccinelle to rewrite. This patch hits all over the place, but I *strongly* prefer this solution to any piecemal approaches that just exercise patch mechanics all over the place. It mainly hits drivers/gpio and drivers/pinctrl which is my own backyard anyway. Cc: Haavard Skinnemoen Cc: Rafał Miłecki Cc: Richard Purdie Cc: Mauro Carvalho Chehab Cc: Alek Du Cc: Jaroslav Kysela Cc: Takashi Iwai Acked-by: Dmitry Torokhov Acked-by: Greg Kroah-Hartman Acked-by: Lee Jones Acked-by: Jiri Kosina Acked-by: Hans-Christian Egtvedt Acked-by: Jacek Anaszewski Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index d1baebf350d8..b02c43be7859 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,7 +20,7 @@ struct seq_file; /** * struct gpio_chip - abstract a GPIO controller * @label: for diagnostics - * @dev: optional device providing the GPIOs + * @parent: optional parent device providing the GPIOs * @cdev: class device used by sysfs interface (may be NULL) * @owner: helps prevent removal of modules exporting active GPIOs * @list: links gpio_chips together for traversal @@ -89,7 +89,7 @@ struct seq_file; */ struct gpio_chip { const char *label; - struct device *dev; + struct device *parent; struct device *cdev; struct module *owner; struct list_head list; -- cgit v1.2.3 From 1971dfb7e8f1cb9d26e8c37fee9e85a7fba6cde4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Nov 2015 18:02:34 +0900 Subject: clk: fix a typo in comment block of struct clk_rate_request Signed-off-by: Masahiro Yamada Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c56988ac63f7..7e931e75b800 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -44,7 +44,7 @@ struct dentry; * @rate: Requested clock rate. This field will be adjusted by * clock drivers according to hardware capabilities. * @min_rate: Minimum rate imposed by clk users. - * @max_rate: Maximum rate a imposed by clk users. + * @max_rate: Maximum rate imposed by clk users. * @best_parent_rate: The best parent rate a parent can provide to fulfill the * requested constraints. * @best_parent_hw: The most appropriate parent clock that fulfills the -- cgit v1.2.3 From 20dd882a09d3cce183eef4c9132c23439caaf0d6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Oct 2015 22:12:56 +0100 Subject: clk: Use static inline functions instead of macros for dummies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit if CONFIG_OF=n: drivers/clk/clk-cs2000-cp.c: In function ‘cs2000_remove’: drivers/clk/clk-cs2000-cp.c:453:22: warning: unused variable ‘np’ [-Wunused-variable] struct device_node *np = dev->of_node; ^ Convert dummies of_clk_del_provider() and of_clk_init() from macros to static inline functions to kill such compiler warnings. Reported-by: kbuild test robot Signed-off-by: Geert Uytterhoeven Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7e931e75b800..1796f7d8526c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -715,8 +715,7 @@ static inline int of_clk_add_provider(struct device_node *np, { return 0; } -#define of_clk_del_provider(np) \ - { while (0); } +static inline void of_clk_del_provider(struct device_node *np) {} static inline struct clk *of_clk_src_simple_get( struct of_phandle_args *clkspec, void *data) { @@ -741,8 +740,7 @@ static inline const char *of_clk_get_parent_name(struct device_node *np, { return NULL; } -#define of_clk_init(matches) \ - { while (0); } +static inline void of_clk_init(const struct of_device_id *matches) {} #endif /* CONFIG_OF */ /* -- cgit v1.2.3 From 9bdca822cbd6b66124f2298504b6c4526599dc8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Nov 2015 22:31:11 +0100 Subject: ASoC: samsung: pass filter function as pointer As we are now passing the filter data as pointers to the drivers, we can take the final step and also pass the filter function the same way. I'm keeping this change separate, as there it's less obvious that this is a net win. Upsides of this are: - The ASoC drivers are completely independent from the DMA engine implementation, which simplifies the Kconfig logic and in theory allows the same sound drivers to be built in a kernel that supports different kinds of dmaengine drivers. - Consistency with other subsystems and drivers On the other hand, we have a few downsides: - The s3c24xx-dma driver now needs to be built-in for the ac97 platform device to be instantiated on s3c2440. - samsung_dmaengine_pcm_config cannot be marked 'const' any more because the filter function pointer needs to be set at runtime. This is safe as long we don't have multiple different DMA engines in thet same system at runtime, but is nonetheless ugly. Signed-off-by: Arnd Bergmann Reviewed-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- include/linux/platform_data/asoc-s3c.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/asoc-s3c.h b/include/linux/platform_data/asoc-s3c.h index 33f88b4479e4..15bf56ee8af7 100644 --- a/include/linux/platform_data/asoc-s3c.h +++ b/include/linux/platform_data/asoc-s3c.h @@ -13,6 +13,9 @@ */ #define S3C64XX_AC97_GPD 0 #define S3C64XX_AC97_GPE 1 + +#include + extern void s3c64xx_ac97_setup_gpio(int); struct samsung_i2s { @@ -39,6 +42,7 @@ struct samsung_i2s { */ struct s3c_audio_pdata { int (*cfg_gpio)(struct platform_device *); + dma_filter_fn dma_filter; void *dma_playback; void *dma_capture; void *dma_play_sec; -- cgit v1.2.3 From 90069ad1b602d05740ed0fc5f72f09a616ceddd0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 19 Nov 2015 19:32:07 +0100 Subject: drivers: sh: clk: Remove obsolete and unused clk_round_parent() clk_round_parent() was only ever used by AP4EVB, until commit b24bd7e97b3784af ("ARM: shmobile: Remove AP4EVB board support"). The Common Clock Framework does not provide clk_round_parent(), hence remove it. Signed-off-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/linux/sh_clk.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sh_clk.h b/include/linux/sh_clk.h index 1f208b2a1ed6..645896b81244 100644 --- a/include/linux/sh_clk.h +++ b/include/linux/sh_clk.h @@ -113,10 +113,6 @@ long clk_rate_div_range_round(struct clk *clk, unsigned int div_min, long clk_rate_mult_range_round(struct clk *clk, unsigned int mult_min, unsigned int mult_max, unsigned long rate); -long clk_round_parent(struct clk *clk, unsigned long target, - unsigned long *best_freq, unsigned long *parent_freq, - unsigned int div_min, unsigned int div_max); - #define SH_CLK_MSTP(_parent, _enable_reg, _enable_bit, _status_reg, _flags) \ { \ .parent = _parent, \ -- cgit v1.2.3 From e80e7edc55ba711f3fe23975061b3f1c336ceb95 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 21 Oct 2015 12:17:35 -0200 Subject: PCI/MSI: Initialize MSI capability for all architectures 1851617cd2da ("PCI/MSI: Disable MSI at enumeration even if kernel doesn't support MSI") moved dev->msi_cap and dev->msix_cap initialization from the pci_init_capabilities() path (used on all architectures) to the pci_setup_device() path (not used on Open Firmware architectures). This broke MSI or MSI-X on Open Firmware machines. 4d9aac397a5d ("powerpc/PCI: Disable MSI/MSI-X interrupts at PCI probe time in OF case") fixed it for PowerPC but not for SPARC. Set up MSI and MSI-X (initialize msi_cap and msix_cap and disable MSI and MSI-X) in pci_init_capabilities() so all architectures do it the same way. This reverts 4d9aac397a5d since this patch fixes the problem generically for both PowerPC and SPARC. [bhelgaas: changelog, make pci_msi_setup_pci_dev() static] Fixes: 1851617cd2da ("PCI/MSI: Disable MSI at enumeration even if kernel doesn't support MSI") Signed-off-by: Guilherme G. Piccoli Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index e828e7b4afec..f9f79add0afb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1248,8 +1248,6 @@ struct msix_entry { u16 entry; /* driver uses to specify entry, OS writes */ }; -void pci_msi_setup_pci_dev(struct pci_dev *dev); - #ifdef CONFIG_PCI_MSI int pci_msi_vec_count(struct pci_dev *dev); void pci_msi_shutdown(struct pci_dev *dev); -- cgit v1.2.3 From f6ba86363908e3f4e3ef11f768be7ca2745b18cf Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 13 Aug 2014 18:33:55 +0200 Subject: drbd: Move enum write_ordering_e to drbd.h Also change the enum values to all-capital letters. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 8723f2a99e15..15a14724a087 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -357,6 +357,13 @@ enum drbd_timeout_flag { #define UUID_JUST_CREATED ((__u64)4) +enum write_ordering_e { + WO_NONE, + WO_DRAIN_IO, + WO_BDEV_FLUSH, + WO_BIO_BARRIER +}; + /* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC_BIG 0x835a -- cgit v1.2.3 From a29728463b254ce81ecefdf20c1a02e01d9361da Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 31 Jul 2014 17:41:33 +0200 Subject: drbd: Backport the "events2" command The events2 command originates from drbd-9 development. It features more information but requires a incompatible change in output format. Therefore the previous events command continues to exist, the new improved events2 command becomes available now. This prepares the user-base for a later switch to the complete drbd9 code base. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 16 +++++++ include/linux/drbd_genl.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 15a14724a087..2c44d7eadd30 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -339,6 +339,8 @@ enum drbd_state_rv { #define MDF_AL_CLEAN (1 << 7) #define MDF_AL_DISABLED (1 << 8) +#define MAX_PEERS 32 + enum drbd_uuid_index { UI_CURRENT, UI_BITMAP, @@ -349,12 +351,26 @@ enum drbd_uuid_index { UI_EXTENDED_SIZE /* Everything. */ }; +#define HISTORY_UUIDS MAX_PEERS + enum drbd_timeout_flag { UT_DEFAULT = 0, UT_DEGRADED = 1, UT_PEER_OUTDATED = 2, }; +enum drbd_notification_type { + NOTIFY_EXISTS, + NOTIFY_CREATE, + NOTIFY_CHANGE, + NOTIFY_DESTROY, + NOTIFY_CALL, + NOTIFY_RESPONSE, + + NOTIFY_CONTINUES = 0x8000, + NOTIFY_FLAGS = NOTIFY_CONTINUES, +}; + #define UUID_JUST_CREATED ((__u64)4) enum write_ordering_e { diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 7b131ed8f9c6..90304f8697ec 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) ) +GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, + __u32_field(1, 0, res_role) + __flg_field(2, 0, res_susp) + __flg_field(3, 0, res_susp_nod) + __flg_field(4, 0, res_susp_fen) + /* __flg_field(5, 0, res_weak) */ +) + +GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info, + __u32_field(1, 0, dev_disk_state) +) + +GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info, + __u32_field(1, 0, conn_connection_state) + __u32_field(2, 0, conn_role) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info, + __u32_field(1, 0, peer_repl_state) + __u32_field(2, 0, peer_disk_state) + __u32_field(3, 0, peer_resync_susp_user) + __u32_field(4, 0, peer_resync_susp_peer) + __u32_field(5, 0, peer_resync_susp_dependency) +) + +GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics, + __u32_field(1, 0, res_stat_write_ordering) +) + +GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics, + __u64_field(1, 0, dev_size) /* (sectors) */ + __u64_field(2, 0, dev_read) /* (sectors) */ + __u64_field(3, 0, dev_write) /* (sectors) */ + __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */ + __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */ + __u32_field(6, 0, dev_upper_pending) /* application requests in progress */ + __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */ + __flg_field(8, 0, dev_upper_blocked) + __flg_field(9, 0, dev_lower_blocked) + __flg_field(10, 0, dev_al_suspended) /* activity log suspended */ + __u64_field(11, 0, dev_exposed_data_uuid) + __u64_field(12, 0, dev_current_uuid) + __u32_field(13, 0, dev_disk_flags) + __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64)) +) + +GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics, + __flg_field(1, 0, conn_congested) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, + __u64_field(1, 0, peer_dev_received) /* sectors */ + __u64_field(2, 0, peer_dev_sent) /* sectors */ + __u32_field(3, 0, peer_dev_pending) /* number of requests */ + __u32_field(4, 0, peer_dev_unacked) /* number of requests */ + __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */ + __u64_field(6, 0, peer_dev_resync_failed) /* sectors */ + __u64_field(7, 0, peer_dev_bitmap_uuid) + __u32_field(9, 0, peer_dev_flags) +) + +GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, + __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) +) + +GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, + __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) + __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) +) + /* * Notifications and commands (genlmsghdr->cmd) */ @@ -382,3 +452,47 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_RESOURCE_STATE, 34, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_DEVICE_STATE, 35, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_CONNECTION_STATE, 36, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_PEER_DEVICE_STATE, 37, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_op( + DRBD_ADM_GET_INITIAL_STATE, 38, + GENL_op_init( + .dumpit = drbd_adm_get_initial_state, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_HELPER, 40, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_INITIAL_STATE_DONE, 41, events, + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)) -- cgit v1.2.3 From a55bbd375d1802141f0f043e2cd08f85c23d6209 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 28 Aug 2014 13:31:14 +0200 Subject: drbd: Backport the "status" command The status command originates the drbd9 code base. While for now we keep the status information in /proc/drbd available, this commit allows the user base to gracefully migrate their monitoring infrastructure to the new status reporting interface. In drbd9 no status information is exposed through /proc/drbd. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_genl.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/idr.h | 14 ++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 90304f8697ec..2d0e5ad5de9d 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -453,6 +453,41 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) +GENL_op(DRBD_ADM_GET_RESOURCES, 30, + GENL_op_init( + .dumpit = drbd_adm_dump_resources, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_DEVICES, 31, + GENL_op_init( + .dumpit = drbd_adm_dump_devices, + .done = drbd_adm_dump_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, + GENL_op_init( + .dumpit = drbd_adm_dump_connections, + .done = drbd_adm_dump_connections_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, + GENL_op_init( + .dumpit = drbd_adm_dump_peer_devices, + .done = drbd_adm_dump_peer_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_notification( DRBD_RESOURCE_STATE, 34, events, GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) diff --git a/include/linux/idr.h b/include/linux/idr.h index 013fd9bc4cb6..083d61e92706 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) +/** + * idr_for_each_entry - continue iteration over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define idr_for_each_entry_continue(idp, entry, id) \ + for ((entry) = idr_get_next((idp), &(id)); \ + entry; \ + ++id, (entry) = idr_get_next((idp), &(id))) + /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. -- cgit v1.2.3 From 92f108b41efdeace60e354bb619c164b50abf6f8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 19 Jan 2015 15:43:04 +0100 Subject: drbd: drop remnants of connector -- we don't use it anymore in drbd 8.4 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 2c44d7eadd30..392fc0edb516 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -25,7 +25,6 @@ */ #ifndef DRBD_H #define DRBD_H -#include #include #ifdef __KERNEL__ -- cgit v1.2.3 From 63a7c8ad92af5f57d4a2c5be223d6ca424c3670b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 26 Mar 2015 20:53:55 +0100 Subject: drbd: make drbd known to lsblk: use bd_link_disk_holder lsblk should be able to pick up stacking device driver relations involving DRBD conveniently. Even though upstream kernel since 2011 says "DON'T USE THIS UNLESS YOU'RE ALREADY USING IT." a new user has been added since (bcache), which sets the precedences for us to use it as well. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 392fc0edb516..d6b3c9943a2c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.5" +#define REL_VERSION "8.4.6" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3 From bb649b34dd3d8f69308f5f193cb64457069c7222 Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Thu, 16 Apr 2015 10:17:51 +0200 Subject: lru_cache: Converted lc_seq_printf_status to return void Fix the semantic of lc_seq_printf. Currently, it always returns 0 and the return value is unused, therefore, convert the return type to void. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 46262284de47..04fc6e6c7ff0 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); extern void lc_committed(struct lru_cache *lc); struct seq_file; -extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); +extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)); -- cgit v1.2.3 From b6a89194182fe7a33d383463b8b9af6e117d8146 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 16 Sep 2015 18:48:22 -0500 Subject: ARM: OMAP2+: Remove omap_mmu_dev_attr structure The structure omap_mmu_dev_attr was used in the hwmod data for supplying device-specific data through the .dev_attr field and used in constructing the platform data for legacy device creation. The legacy device creation of OMAP IOMMU devices has been cleaned up, and this structure is no longer needed, so remove it. Signed-off-by: Suman Anna Signed-off-by: Tony Lindgren --- include/linux/platform_data/iommu-omap.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index 54a0a9582fad..0496d171700a 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -29,15 +29,6 @@ struct omap_iommu_arch_data { struct omap_iommu *iommu_dev; }; -/** - * struct omap_mmu_dev_attr - OMAP mmu device attributes for omap_hwmod - * @nr_tlb_entries: number of entries supported by the translation - * look-aside buffer (TLB). - */ -struct omap_mmu_dev_attr { - int nr_tlb_entries; -}; - struct iommu_platform_data { const char *name; const char *reset_name; -- cgit v1.2.3 From 07ff73a932b725b2a4675bd0cc1a86b4933e433e Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 30 Nov 2015 16:43:25 +0200 Subject: clk: ti: omap5+: dpll: implement errata i810 Errata i810 states that DPLL controller can get stuck while transitioning to a power saving state, while its M/N ratio is being re-programmed. As a workaround, before re-programming the M/N ratio, SW has to ensure the DPLL cannot start an idle state transition. SW can disable DPLL idling by setting the DPLL AUTO_DPLL_MODE=0 or keeping a clock request active by setting a dependent clock domain in SW_WKUP. This errata impacts OMAP5 and DRA7 chips, so enable the errata for these. Signed-off-by: Tero Kristo Signed-off-by: Stephen Boyd --- include/linux/clk/ti.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 223be696df27..75205df29b9c 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -286,6 +286,7 @@ struct ti_clk_features { #define TI_CLK_DPLL_HAS_FREQSEL BIT(0) #define TI_CLK_DPLL4_DENY_REPROGRAM BIT(1) #define TI_CLK_DISABLE_CLKDM_CONTROL BIT(2) +#define TI_CLK_ERRATA_I810 BIT(3) void ti_clk_setup_features(struct ti_clk_features *features); const struct ti_clk_features *ti_clk_get_features(void); -- cgit v1.2.3 From 6f3b0e8bcf3cbb87a7459b3ed018d31d918df3f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:13:05 +0100 Subject: blk-mq: add a flags parameter to blk_mq_alloc_request We already have the reserved flag, and a nowait flag awkwardly encoded as a gfp_t. Add a real flags argument to make the scheme more extensible and allow for a nicer calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 +++++++- include/linux/blkdev.h | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index daf17d70aeca..7fc9296b5742 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -188,8 +188,14 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); + +enum { + BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ + BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ +}; + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved); + unsigned int flags); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b7927c1f..e711f294934c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,7 +794,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern int blk_queue_enter(struct request_queue *q, bool nowait); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); -- cgit v1.2.3 From 7a67cbea653e444d04d7e850ab9631a14a196422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Nov 2015 08:58:10 +0100 Subject: nvme: use offset instead of a struct for registers This makes life easier for future non-PCI drivers where access to the registers might be more complicated. Note that Linux drivers are pretty evenly split between the two versions, and in fact the NVMe driver already uses offsets for the doorbells. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch [Fixed CMBSZ offset] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3af5f454c04a..a55986f6fe38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,20 +17,19 @@ #include -struct nvme_bar { - __u64 cap; /* Controller Capabilities */ - __u32 vs; /* Version */ - __u32 intms; /* Interrupt Mask Set */ - __u32 intmc; /* Interrupt Mask Clear */ - __u32 cc; /* Controller Configuration */ - __u32 rsvd1; /* Reserved */ - __u32 csts; /* Controller Status */ - __u32 nssr; /* Subsystem Reset */ - __u32 aqa; /* Admin Queue Attributes */ - __u64 asq; /* Admin SQ Base Address */ - __u64 acq; /* Admin CQ Base Address */ - __u32 cmbloc; /* Controller Memory Buffer Location */ - __u32 cmbsz; /* Controller Memory Buffer Size */ +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Set */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin SQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) -- cgit v1.2.3 From 990f2f223cb479a15afda9eb8552582aa82e2404 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Apr 2014 15:20:50 +0200 Subject: clk: mmp: stop using platform headers The mmp clock drivers currently hardcode the physical addresses for the clock registers. This is generally a bad idea, and it also gets in the way of multiplatform builds, which make the platform header files inaccessible to device drivers. To work around the header file problem, this patch changes the calling convention so the three mmp clock drivers get initialized with the base addresses as arguments from the platform code. It would still be useful to have a larger rework of the clock drivers, with DT integration to let the clocks actually be probed automatically, and the base addresses passed as DT properties. I am unsure if anyone is still interested in the mmp platform, so it is possible that this won't happen. Signed-off-by: Arnd Bergmann Cc: Mike Turquette Cc: Chao Xie Cc: Eric Miao Cc: Haojian Zhuang --- include/linux/clk/mmp.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/clk/mmp.h (limited to 'include/linux') diff --git a/include/linux/clk/mmp.h b/include/linux/clk/mmp.h new file mode 100644 index 000000000000..607321fa2c2b --- /dev/null +++ b/include/linux/clk/mmp.h @@ -0,0 +1,17 @@ +#ifndef __CLK_MMP_H +#define __CLK_MMP_H + +#include + +extern void pxa168_clk_init(phys_addr_t mpmu_phys, + phys_addr_t apmu_phys, + phys_addr_t apbc_phys); +extern void pxa910_clk_init(phys_addr_t mpmu_phys, + phys_addr_t apmu_phys, + phys_addr_t apbc_phys, + phys_addr_t apbcp_phys); +extern void mmp2_clk_init(phys_addr_t mpmu_phys, + phys_addr_t apmu_phys, + phys_addr_t apbc_phys); + +#endif -- cgit v1.2.3 From a829ae57f8b17bbebc7b9b2cbec99686b88a9e25 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Mar 2015 09:47:23 +0100 Subject: ARM: s3c64xx: use new adc/touchscreen driver The old ADC and touchscreen drivers are not compatible with multiplatform support, but we can use the exynos-adc driver as a replacement. This changes the common device creation functions for s3c64xx (but not s3c24xx for now) to use the new driver. To do this, we have to pass the interrupt resources in the opposite order and pass the platform data in the adc device node. Signed-off-by: Arnd Bergmann --- include/linux/platform_data/touchscreen-s3c2410.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/touchscreen-s3c2410.h b/include/linux/platform_data/touchscreen-s3c2410.h index 58dc7c5ae63b..71eccaa9835d 100644 --- a/include/linux/platform_data/touchscreen-s3c2410.h +++ b/include/linux/platform_data/touchscreen-s3c2410.h @@ -17,6 +17,7 @@ struct s3c2410_ts_mach_info { }; extern void s3c24xx_ts_set_platdata(struct s3c2410_ts_mach_info *); +extern void s3c64xx_ts_set_platdata(struct s3c2410_ts_mach_info *); /* defined by architecture to configure gpio */ extern void s3c24xx_ts_cfg_gpio(struct platform_device *dev); -- cgit v1.2.3 From 9d2aa8c7961ae9af5f75af2dc171dd4e4f441e89 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 1 Dec 2015 15:00:24 +0100 Subject: ARM/clocksource: use automatic DT probing for ux500 PRCMU The ARM core kernel already calls clocksource_of_init() so why go to all the trouble of locating and probing this node in the machine. CLOCKSOURCE_OF_DECLARE() will take care of it in the clocksource driver, and thus we can also get rid of the dangling header file Suggested-by: Arnd Bergmann Acked-by: Thomas Gleixner Acked-by: Daniel Lezcano Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/clksrc-dbx500-prcmu.h | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 include/linux/clksrc-dbx500-prcmu.h (limited to 'include/linux') diff --git a/include/linux/clksrc-dbx500-prcmu.h b/include/linux/clksrc-dbx500-prcmu.h deleted file mode 100644 index 4fb8119c49e4..000000000000 --- a/include/linux/clksrc-dbx500-prcmu.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (C) ST-Ericsson SA 2011 - * - * License Terms: GNU General Public License v2 - * Author: Mattias Wallin - * - */ -#ifndef __CLKSRC_DBX500_PRCMU_H -#define __CLKSRC_DBX500_PRCMU_H - -#include -#include - -#ifdef CONFIG_CLKSRC_DBX500_PRCMU -void __init clksrc_dbx500_prcmu_init(void __iomem *base); -#else -static inline void __init clksrc_dbx500_prcmu_init(void __iomem *base) {} -#endif - -#endif -- cgit v1.2.3 From cdd5de500b2c90d5181ebc963826019a0a4234ba Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 22 Sep 2015 19:14:54 -0500 Subject: soc: ti: Add wkup_m3_ipc driver Introduce a wkup_m3_ipc driver to handle communication between the MPU and Cortex M3 wkup_m3 present on am335x. This driver is responsible for actually booting the wkup_m3_rproc and also handling all IPC which is done using the IPC registers in the control module, a mailbox, and a separate interrupt back from the wkup_m3. A small API is exposed for executing specific power commands, which include configuring for low power mode, request a transition to a low power mode, and status info on a previous transition. Signed-off-by: Dave Gerlach Signed-off-by: Tony Lindgren --- include/linux/wkup_m3_ipc.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/linux/wkup_m3_ipc.h (limited to 'include/linux') diff --git a/include/linux/wkup_m3_ipc.h b/include/linux/wkup_m3_ipc.h new file mode 100644 index 000000000000..d6ba7d39a62f --- /dev/null +++ b/include/linux/wkup_m3_ipc.h @@ -0,0 +1,55 @@ +/* + * TI Wakeup M3 for AMx3 SoCs Power Management Routines + * + * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com/ + * Dave Gerlach + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WKUP_M3_IPC_H +#define _LINUX_WKUP_M3_IPC_H + +#define WKUP_M3_DEEPSLEEP 1 +#define WKUP_M3_STANDBY 2 +#define WKUP_M3_IDLE 3 + +#include + +struct wkup_m3_ipc_ops; + +struct wkup_m3_ipc { + struct rproc *rproc; + + void __iomem *ipc_mem_base; + struct device *dev; + + int mem_type; + unsigned long resume_addr; + int state; + + struct completion sync_complete; + struct mbox_client mbox_client; + struct mbox_chan *mbox; + + struct wkup_m3_ipc_ops *ops; +}; + +struct wkup_m3_ipc_ops { + void (*set_mem_type)(struct wkup_m3_ipc *m3_ipc, int mem_type); + void (*set_resume_address)(struct wkup_m3_ipc *m3_ipc, void *addr); + int (*prepare_low_power)(struct wkup_m3_ipc *m3_ipc, int state); + int (*finish_low_power)(struct wkup_m3_ipc *m3_ipc); + int (*request_pm_status)(struct wkup_m3_ipc *m3_ipc); +}; + +struct wkup_m3_ipc *wkup_m3_ipc_get(void); +void wkup_m3_ipc_put(struct wkup_m3_ipc *m3_ipc); +#endif /* _LINUX_WKUP_M3_IPC_H */ -- cgit v1.2.3 From 06c1e3902aa74b7432a7e82bb4a5aca233a42839 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 3 Dec 2015 09:32:21 -0700 Subject: blk-integrity: empty implementation when disabled This patch moves the blk_integrity_payload definition outside the CONFIG_BLK_DEV_INTERITY dependency and provides empty function implementations when the kernel configuration disables integrity extensions. This simplifies drivers that make use of these to map user data so they don't need to repeat the same configuration checks. Signed-off-by: Keith Busch Updated by Jens to pass an error pointer return from bio_integrity_alloc(), otherwise if CONFIG_BLK_DEV_INTEGRITY isn't set, we return a weird ENOMEM from __nvme_submit_user_cmd() if a meta buffer is set. Signed-off-by: Jens Axboe --- include/linux/bio.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e046b52e..5349e6816cbb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 67098119abeb596823ed0a74dd8cdcfbee4c2210 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 8 Dec 2015 10:43:28 +0000 Subject: soc: dove: add legacy support to PMU driver Add support for legacy non-DT Dove to the PMU driver, so that we can transition the legacy support over. [gregory.clement@free-electrons.com: removed pm_genpd_poweroff_unused] Acked-by: Arnd Bergmann Signed-off-by: Russell King Signed-off-by: Gregory CLEMENT --- include/linux/soc/dove/pmu.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/dove/pmu.h b/include/linux/soc/dove/pmu.h index 9c99f84bcc0e..765386972b55 100644 --- a/include/linux/soc/dove/pmu.h +++ b/include/linux/soc/dove/pmu.h @@ -1,6 +1,25 @@ #ifndef LINUX_SOC_DOVE_PMU_H #define LINUX_SOC_DOVE_PMU_H +#include + +struct dove_pmu_domain_initdata { + u32 pwr_mask; + u32 rst_mask; + u32 iso_mask; + const char *name; +}; + +struct dove_pmu_initdata { + void __iomem *pmc_base; + void __iomem *pmu_base; + int irq; + int irq_domain_start; + const struct dove_pmu_domain_initdata *domains; +}; + +int dove_init_pmu_legacy(const struct dove_pmu_initdata *); + int dove_init_pmu(void); #endif -- cgit v1.2.3 From 9460ae2ff3081b43e4f93126cfd26a27cda1b6a1 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 24 Sep 2015 18:25:01 -0700 Subject: soc: qcom: Introduce common SMEM state machine code This implements a common API for handling and exposing SMP2P and SMSM state information. Signed-off-by: Bjorn Andersson Signed-off-by: Andy Gross --- include/linux/soc/qcom/smem_state.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/soc/qcom/smem_state.h (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smem_state.h b/include/linux/soc/qcom/smem_state.h new file mode 100644 index 000000000000..f35e1512fcaa --- /dev/null +++ b/include/linux/soc/qcom/smem_state.h @@ -0,0 +1,18 @@ +#ifndef __QCOM_SMEM_STATE__ +#define __QCOM_SMEM_STATE__ + +struct qcom_smem_state; + +struct qcom_smem_state_ops { + int (*update_bits)(void *, u32, u32); +}; + +struct qcom_smem_state *qcom_smem_state_get(struct device *dev, const char *con_id, unsigned *bit); +void qcom_smem_state_put(struct qcom_smem_state *); + +int qcom_smem_state_update_bits(struct qcom_smem_state *state, u32 mask, u32 value); + +struct qcom_smem_state *qcom_smem_state_register(struct device_node *of_node, const struct qcom_smem_state_ops *ops, void *data); +void qcom_smem_state_unregister(struct qcom_smem_state *state); + +#endif -- cgit v1.2.3 From 4e34df0cba14e95e941bf73721352fa4d9c2622f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 3 Dec 2015 12:02:31 -0800 Subject: ARM: OMAP2+: Add DPPLS clock manager for dm814x On dm814x we have some clocks at DPLLS and some at PRCM. Let's add a new omap_prcm_init_data entry for the DPLLS so we can initalize timer clocks early. Cc: Paul Walmsley Cc: Tero Kristo Signed-off-by: Tony Lindgren --- include/linux/clk/ti.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 223be696df27..57663c162e1c 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -195,6 +195,7 @@ enum { TI_CLKM_PRM, TI_CLKM_SCRM, TI_CLKM_CTRL, + TI_CLKM_PLLSS, CLK_MAX_MEMMAPS }; -- cgit v1.2.3 From a755e169031dac9ebaed03302c4921687c271d62 Mon Sep 17 00:00:00 2001 From: "Jason S. McMullan" Date: Wed, 30 Sep 2015 15:35:06 +0900 Subject: PCI: Add Netronome vendor and device IDs Device IDs for the Netronome NFP3200, NFP3240, NFP6000, and NFP6000 SR-IOV devices. Signed-off-by: Jason S. McMullan [simon: edited changelog] Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..526e2c12ae59 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,12 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee +#define PCI_DEVICE_ID_NETRONOME_NFP3200 0x3200 +#define PCI_DEVICE_ID_NETRONOME_NFP3240 0x3240 +#define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 +#define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b -- cgit v1.2.3 From 8b092be9fd6a2cd84c437128e9b0d85e364efcfb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 4 Dec 2015 16:33:52 +0100 Subject: gpio: rcar: Remove obsolete platform data support Since commit 4baadb9e05c68962 ("ARM: shmobile: r8a7778: remove obsolete setup code"), Renesas R-Car SoCs are only supported in generic DT-only ARM multi-platform builds. The driver doesn't need to use platform data anymore, hence remove platform data configuration. Make gpio_rcar_priv.has_both_edge_trigger a boolean for consistency with gpio_rcar_info.has_both_edge_trigger. Move gpio_rcar_priv.irq_parent down while we're at it, to prevent gaps on 64-bit. Signed-off-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Linus Walleij --- include/linux/platform_data/gpio-rcar.h | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/platform_data/gpio-rcar.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-rcar.h b/include/linux/platform_data/gpio-rcar.h deleted file mode 100644 index 2d8d69432813..000000000000 --- a/include/linux/platform_data/gpio-rcar.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Renesas R-Car GPIO Support - * - * Copyright (C) 2013 Magnus Damm - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __GPIO_RCAR_H__ -#define __GPIO_RCAR_H__ - -struct gpio_rcar_config { - int gpio_base; - unsigned int irq_base; - unsigned int number_of_pins; - const char *pctl_name; - unsigned has_both_edge_trigger:1; -}; - -#define RCAR_GP_PIN(bank, pin) (((bank) * 32) + (pin)) - -#endif /* __GPIO_RCAR_H__ */ -- cgit v1.2.3 From 511cbce2ff8b9d322077909ee90c5d4b67b29b75 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2015 14:56:14 +0100 Subject: irq_poll: make blk-iopoll available outside the block layer The new name is irq_poll as iopoll is already taken. Better suggestions welcome. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche --- include/linux/blk-iopoll.h | 46 ---------------------------------------------- include/linux/interrupt.h | 2 +- include/linux/irq_poll.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 47 deletions(-) delete mode 100644 include/linux/blk-iopoll.h create mode 100644 include/linux/irq_poll.h (limited to 'include/linux') diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h deleted file mode 100644 index 77ae77c0b704..000000000000 --- a/include/linux/blk-iopoll.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef BLK_IOPOLL_H -#define BLK_IOPOLL_H - -struct blk_iopoll; -typedef int (blk_iopoll_fn)(struct blk_iopoll *, int); - -struct blk_iopoll { - struct list_head list; - unsigned long state; - unsigned long data; - int weight; - int max; - blk_iopoll_fn *poll; -}; - -enum { - IOPOLL_F_SCHED = 0, - IOPOLL_F_DISABLE = 1, -}; - -/* - * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating - * that we were the first to acquire this iop for scheduling. If this iop - * is currently disabled, return "failure". - */ -static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop) -{ - if (!test_bit(IOPOLL_F_DISABLE, &iop->state)) - return test_and_set_bit(IOPOLL_F_SCHED, &iop->state); - - return 1; -} - -static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop) -{ - return test_bit(IOPOLL_F_DISABLE, &iop->state); -} - -extern void blk_iopoll_sched(struct blk_iopoll *); -extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *); -extern void blk_iopoll_complete(struct blk_iopoll *); -extern void __blk_iopoll_complete(struct blk_iopoll *); -extern void blk_iopoll_enable(struct blk_iopoll *); -extern void blk_iopoll_disable(struct blk_iopoll *); - -#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ad16809c8596..7ff98c23199a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -412,7 +412,7 @@ enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, - BLOCK_IOPOLL_SOFTIRQ, + IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h new file mode 100644 index 000000000000..50c39dcd2cba --- /dev/null +++ b/include/linux/irq_poll.h @@ -0,0 +1,46 @@ +#ifndef IRQ_POLL_H +#define IRQ_POLL_H + +struct irq_poll; +typedef int (irq_poll_fn)(struct irq_poll *, int); + +struct irq_poll { + struct list_head list; + unsigned long state; + unsigned long data; + int weight; + int max; + irq_poll_fn *poll; +}; + +enum { + IRQ_POLL_F_SCHED = 0, + IRQ_POLL_F_DISABLE = 1, +}; + +/* + * Returns 0 if we successfully set the IRQ_POLL_F_SCHED bit, indicating + * that we were the first to acquire this iop for scheduling. If this iop + * is currently disabled, return "failure". + */ +static inline int irq_poll_sched_prep(struct irq_poll *iop) +{ + if (!test_bit(IRQ_POLL_F_DISABLE, &iop->state)) + return test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state); + + return 1; +} + +static inline int irq_poll_disable_pending(struct irq_poll *iop) +{ + return test_bit(IRQ_POLL_F_DISABLE, &iop->state); +} + +extern void irq_poll_sched(struct irq_poll *); +extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *); +extern void irq_poll_complete(struct irq_poll *); +extern void __irq_poll_complete(struct irq_poll *); +extern void irq_poll_enable(struct irq_poll *); +extern void irq_poll_disable(struct irq_poll *); + +#endif -- cgit v1.2.3 From ea51190c03150fce4d9e428bfb608abbe0991db8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2015 06:41:11 -0800 Subject: irq_poll: fold irq_poll_sched_prep into irq_poll_sched There is no good reason to keep them apart, and this makes using the API a bit simpler. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche --- include/linux/irq_poll.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h index 50c39dcd2cba..57efae661400 100644 --- a/include/linux/irq_poll.h +++ b/include/linux/irq_poll.h @@ -18,19 +18,6 @@ enum { IRQ_POLL_F_DISABLE = 1, }; -/* - * Returns 0 if we successfully set the IRQ_POLL_F_SCHED bit, indicating - * that we were the first to acquire this iop for scheduling. If this iop - * is currently disabled, return "failure". - */ -static inline int irq_poll_sched_prep(struct irq_poll *iop) -{ - if (!test_bit(IRQ_POLL_F_DISABLE, &iop->state)) - return test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state); - - return 1; -} - static inline int irq_poll_disable_pending(struct irq_poll *iop) { return test_bit(IRQ_POLL_F_DISABLE, &iop->state); -- cgit v1.2.3 From 0bc92ace52ef3ed1c8eb9bcf36cd3d7ca72d5d14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2015 06:56:36 -0800 Subject: irq_poll: fold irq_poll_disable_pending into irq_poll_softirq Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche --- include/linux/irq_poll.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h index 57efae661400..b4ad03cee9d4 100644 --- a/include/linux/irq_poll.h +++ b/include/linux/irq_poll.h @@ -18,11 +18,6 @@ enum { IRQ_POLL_F_DISABLE = 1, }; -static inline int irq_poll_disable_pending(struct irq_poll *iop) -{ - return test_bit(IRQ_POLL_F_DISABLE, &iop->state); -} - extern void irq_poll_sched(struct irq_poll *); extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *); extern void irq_poll_complete(struct irq_poll *); -- cgit v1.2.3 From 83af187d1b776753d58b53d155318d94f9428e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2015 06:57:25 -0800 Subject: irq_poll: mark __irq_poll_complete static Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche --- include/linux/irq_poll.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h index b4ad03cee9d4..8c4b4087f1f2 100644 --- a/include/linux/irq_poll.h +++ b/include/linux/irq_poll.h @@ -21,7 +21,6 @@ enum { extern void irq_poll_sched(struct irq_poll *); extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *); extern void irq_poll_complete(struct irq_poll *); -extern void __irq_poll_complete(struct irq_poll *); extern void irq_poll_enable(struct irq_poll *); extern void irq_poll_disable(struct irq_poll *); -- cgit v1.2.3 From 839a301dc2c007ec942b73a0025695056648f59b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2015 06:57:52 -0800 Subject: irq_poll: remove unused data and max fields Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche --- include/linux/irq_poll.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h index 8c4b4087f1f2..3e8c1b8fb9be 100644 --- a/include/linux/irq_poll.h +++ b/include/linux/irq_poll.h @@ -7,9 +7,7 @@ typedef int (irq_poll_fn)(struct irq_poll *, int); struct irq_poll { struct list_head list; unsigned long state; - unsigned long data; int weight; - int max; irq_poll_fn *poll; }; -- cgit v1.2.3 From 2165bf524da5f5e496d1cdb8c5afae1345ecce1e Mon Sep 17 00:00:00 2001 From: Damien Riegel Date: Mon, 16 Nov 2015 12:27:59 -0500 Subject: watchdog: core: add restart handler support Many watchdog drivers implement the same code to register a restart handler. This patch provides a generic way to set such a function. The patch adds a new restart watchdog operation. If a restart priority greater than 0 is needed, the driver can call watchdog_set_restart_priority to set it. Suggested-by: Vivien Didelot Signed-off-by: Damien Riegel Reviewed-by: Guenter Roeck Reviewed-by: Vivien Didelot Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 027b1f43f12d..5b52c834f7aa 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct watchdog_ops; @@ -26,6 +27,7 @@ struct watchdog_device; * @status: The routine that shows the status of the watchdog device. * @set_timeout:The routine for setting the watchdog devices timeout value (in seconds). * @get_timeleft:The routine that gets the time left before a reset (in seconds). + * @restart: The routine for restarting the machine. * @ref: The ref operation for dyn. allocated watchdog_device structs * @unref: The unref operation for dyn. allocated watchdog_device structs * @ioctl: The routines that handles extra ioctl calls. @@ -45,6 +47,7 @@ struct watchdog_ops { unsigned int (*status)(struct watchdog_device *); int (*set_timeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); + int (*restart)(struct watchdog_device *); void (*ref)(struct watchdog_device *); void (*unref)(struct watchdog_device *); long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); @@ -62,6 +65,7 @@ struct watchdog_ops { * @timeout: The watchdog devices timeout value (in seconds). * @min_timeout:The watchdog devices minimum timeout value (in seconds). * @max_timeout:The watchdog devices maximum timeout value (in seconds). + * @restart_nb: The notifier block to register a restart function. * @driver-data:Pointer to the drivers private data. * @lock: Lock for watchdog core internal use only. * @status: Field that contains the devices internal status bits. @@ -88,6 +92,7 @@ struct watchdog_device { unsigned int timeout; unsigned int min_timeout; unsigned int max_timeout; + struct notifier_block restart_nb; void *driver_data; struct mutex lock; unsigned long status; @@ -142,6 +147,7 @@ static inline void *watchdog_get_drvdata(struct watchdog_device *wdd) } /* drivers/watchdog/watchdog_core.c */ +void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority); extern int watchdog_init_timeout(struct watchdog_device *wdd, unsigned int timeout_parm, struct device *dev); extern int watchdog_register_device(struct watchdog_device *); -- cgit v1.2.3 From 65a4a1dc31ad9d73918971f1b89c617812c494bf Mon Sep 17 00:00:00 2001 From: Damien Riegel Date: Mon, 16 Nov 2015 12:28:00 -0500 Subject: watchdog: bcm47xx_wdt: use core restart handler Get rid of the custom restart handler by using the one provided by the watchdog core. Signed-off-by: Damien Riegel Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/bcm47xx_wdt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_wdt.h b/include/linux/bcm47xx_wdt.h index 5582c211f594..b708786d4cbf 100644 --- a/include/linux/bcm47xx_wdt.h +++ b/include/linux/bcm47xx_wdt.h @@ -16,7 +16,6 @@ struct bcm47xx_wdt { struct watchdog_device wdd; struct notifier_block notifier; - struct notifier_block restart_handler; struct timer_list soft_timer; atomic_t soft_ticks; -- cgit v1.2.3 From e131319669e0ef5e6fcd75174daeffa40492135c Mon Sep 17 00:00:00 2001 From: Damien Riegel Date: Fri, 20 Nov 2015 16:54:51 -0500 Subject: watchdog: core: add reboot notifier support Many watchdog drivers register a reboot notifier in order to stop the watchdog on system reboot. Thus we can factorize this code in the watchdog core. For that purpose, a new notifier block is added in watchdog_device for internal use only, as well as a new watchdog_stop_on_reboot helper function. If this helper is called, watchdog core registers the related notifier block and will stop the watchdog when SYS_HALT or SYS_DOWN is received. Since this operation can be critical on some platforms, abort the device registration if the reboot notifier registration fails. Suggested-by: Vivien Didelot Signed-off-by: Damien Riegel Reviewed-by: Vivien Didelot Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 5b52c834f7aa..a88f955fde92 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -65,6 +65,7 @@ struct watchdog_ops { * @timeout: The watchdog devices timeout value (in seconds). * @min_timeout:The watchdog devices minimum timeout value (in seconds). * @max_timeout:The watchdog devices maximum timeout value (in seconds). + * @reboot_nb: The notifier block to stop watchdog on reboot. * @restart_nb: The notifier block to register a restart function. * @driver-data:Pointer to the drivers private data. * @lock: Lock for watchdog core internal use only. @@ -92,6 +93,7 @@ struct watchdog_device { unsigned int timeout; unsigned int min_timeout; unsigned int max_timeout; + struct notifier_block reboot_nb; struct notifier_block restart_nb; void *driver_data; struct mutex lock; @@ -102,6 +104,7 @@ struct watchdog_device { #define WDOG_ALLOW_RELEASE 2 /* Did we receive the magic char ? */ #define WDOG_NO_WAY_OUT 3 /* Is 'nowayout' feature set ? */ #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ +#define WDOG_STOP_ON_REBOOT 5 /* Should be stopped on reboot */ struct list_head deferred; }; @@ -121,6 +124,12 @@ static inline void watchdog_set_nowayout(struct watchdog_device *wdd, bool noway set_bit(WDOG_NO_WAY_OUT, &wdd->status); } +/* Use the following function to stop the watchdog on reboot */ +static inline void watchdog_stop_on_reboot(struct watchdog_device *wdd) +{ + set_bit(WDOG_STOP_ON_REBOOT, &wdd->status); +} + /* Use the following function to check if a timeout value is invalid */ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigned int t) { -- cgit v1.2.3 From 2786aadeab263609eb690ca37e7dfd3b9ffa3625 Mon Sep 17 00:00:00 2001 From: Damien Riegel Date: Fri, 20 Nov 2015 16:54:52 -0500 Subject: watchdog: bcm47xx_wdt: use core reboot notifier Get rid of the custom reboot notifier block registration and use the one provided by the watchdog core. Signed-off-by: Damien Riegel Reviewed-by: Guenter Roeck Reviewed-by: Vivien Didelot Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/bcm47xx_wdt.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_wdt.h b/include/linux/bcm47xx_wdt.h index b708786d4cbf..8d9d07ec22a5 100644 --- a/include/linux/bcm47xx_wdt.h +++ b/include/linux/bcm47xx_wdt.h @@ -1,7 +1,6 @@ #ifndef LINUX_BCM47XX_WDT_H_ #define LINUX_BCM47XX_WDT_H_ -#include #include #include #include @@ -15,7 +14,6 @@ struct bcm47xx_wdt { void *driver_data; struct watchdog_device wdd; - struct notifier_block notifier; struct timer_list soft_timer; atomic_t soft_ticks; -- cgit v1.2.3 From 5ec9653806baa5928ee01109004411e3bed376f2 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 25 Nov 2015 00:11:48 -0300 Subject: fbdev: Make fb-notify a no-op if CONFIG_FB=n There's no point in having support for framebuffer notifications is CONFIG_FB is disabled. This commit adds the necessary stubs for code to link properly when CONFIG_FB=n and moves fb-notify.o to be built only when CONFIG_FB=y. Signed-off-by: Ezequiel Garcia Signed-off-by: Tomi Valkeinen --- include/linux/fb.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 3d003805aac3..55433f86f0a3 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -175,9 +175,27 @@ struct fb_blit_caps { u32 flags; }; +#ifdef CONFIG_FB_NOTIFY extern int fb_register_client(struct notifier_block *nb); extern int fb_unregister_client(struct notifier_block *nb); extern int fb_notifier_call_chain(unsigned long val, void *v); +#else +static inline int fb_register_client(struct notifier_block *nb) +{ + return 0; +}; + +static inline int fb_unregister_client(struct notifier_block *nb) +{ + return 0; +}; + +static inline int fb_notifier_call_chain(unsigned long val, void *v) +{ + return 0; +}; +#endif + /* * Pixmap structure definition * -- cgit v1.2.3 From 7626676320f398980a6bb4490fd58e924c888f6a Mon Sep 17 00:00:00 2001 From: Dmitry Kasatkin Date: Thu, 22 Oct 2015 21:26:32 +0300 Subject: evm: provide a function to set the EVM key from the kernel A crypto HW kernel module can possibly initialize the EVM key from the kernel __init code to enable EVM before calling the 'init' process. This patch provides a function evm_set_key() to set the EVM key directly without using the KEY subsystem. Changes in v4: * kernel-doc style for evm_set_key Changes in v3: * error reporting moved to evm_set_key * EVM_INIT_HMAC moved to evm_set_key * added bitop to prevent key setting race Changes in v2: * use size_t for key size instead of signed int * provide EVM_MAX_KEY_SIZE macro in * provide EVM_MIN_KEY_SIZE macro in Signed-off-by: Dmitry Kasatkin Signed-off-by: Mimi Zohar --- include/linux/evm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 1fcb88ca88de..35ed9a8a403a 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -14,6 +14,7 @@ struct integrity_iint_cache; #ifdef CONFIG_EVM +extern int evm_set_key(void *key, size_t keylen); extern enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, @@ -42,6 +43,12 @@ static inline int posix_xattr_acl(const char *xattrname) } #endif #else + +static inline int evm_set_key(void *key, size_t keylen) +{ + return -EOPNOTSUPP; +} + #ifdef CONFIG_INTEGRITY static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, -- cgit v1.2.3 From d3600bcf9d64d88dc1d189a754dcfab960ce751f Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 10 Nov 2015 08:34:46 -0500 Subject: KEYS: prevent keys from being removed from specified keyrings Userspace should not be allowed to remove keys from certain keyrings (eg. blacklist), though the keys themselves can expire. This patch defines a new key flag named KEY_FLAG_KEEP to prevent userspace from being able to unlink, revoke, invalidate or timed out a key on a keyring. When this flag is set on the keyring, all keys subsequently added are flagged. In addition, when this flag is set, the keyring itself can not be cleared. Signed-off-by: Mimi Zohar Cc: David Howells --- include/linux/key.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 66f705243985..7321ab8ef949 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -177,6 +177,7 @@ struct key { #define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ #define KEY_FLAG_BUILTIN 10 /* set if key is builtin */ #define KEY_FLAG_ROOT_CAN_INVAL 11 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_KEEP 12 /* set if key should not be removed */ /* the key type and key description string * - the desc is used to match a key against search criteria -- cgit v1.2.3 From 6604c6556db9e41c85f2839f66bd9d617bcf9f87 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 2 Nov 2015 12:14:21 +0100 Subject: pwm: Add PWM driver for OMAP using dual-mode timers Adds support for using a OMAP dual-mode timer with PWM capability as a Linux PWM device. The driver controls the timer by using the dmtimer API. Add a platform_data structure for each pwm-omap-dmtimer nodes containing the dmtimers functions in order to get driver not rely on platform specific functions. Cc: Grant Erickson Cc: NeilBrown Cc: Joachim Eastwood Suggested-by: Tony Lindgren Signed-off-by: Neil Armstrong Acked-by: Tony Lindgren [thierry.reding@gmail.com: coding style bikeshed, fix timer leak] Signed-off-by: Thierry Reding --- include/linux/platform_data/pwm_omap_dmtimer.h | 69 ++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/linux/platform_data/pwm_omap_dmtimer.h (limited to 'include/linux') diff --git a/include/linux/platform_data/pwm_omap_dmtimer.h b/include/linux/platform_data/pwm_omap_dmtimer.h new file mode 100644 index 000000000000..59384217208f --- /dev/null +++ b/include/linux/platform_data/pwm_omap_dmtimer.h @@ -0,0 +1,69 @@ +/* + * include/linux/platform_data/pwm_omap_dmtimer.h + * + * OMAP Dual-Mode Timer PWM platform data + * + * Copyright (C) 2010 Texas Instruments Incorporated - http://www.ti.com/ + * Tarun Kanti DebBarma + * Thara Gopinath + * + * Platform device conversion and hwmod support. + * + * Copyright (C) 2005 Nokia Corporation + * Author: Lauri Leukkunen + * PWM and clock framework support by Timo Teras. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __PWM_OMAP_DMTIMER_PDATA_H +#define __PWM_OMAP_DMTIMER_PDATA_H + +/* trigger types */ +#define PWM_OMAP_DMTIMER_TRIGGER_NONE 0x00 +#define PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW 0x01 +#define PWM_OMAP_DMTIMER_TRIGGER_OVERFLOW_AND_COMPARE 0x02 + +struct omap_dm_timer; +typedef struct omap_dm_timer pwm_omap_dmtimer; + +struct pwm_omap_dmtimer_pdata { + pwm_omap_dmtimer *(*request_by_node)(struct device_node *np); + int (*free)(pwm_omap_dmtimer *timer); + + void (*enable)(pwm_omap_dmtimer *timer); + void (*disable)(pwm_omap_dmtimer *timer); + + struct clk *(*get_fclk)(pwm_omap_dmtimer *timer); + + int (*start)(pwm_omap_dmtimer *timer); + int (*stop)(pwm_omap_dmtimer *timer); + + int (*set_load)(pwm_omap_dmtimer *timer, int autoreload, + unsigned int value); + int (*set_match)(pwm_omap_dmtimer *timer, int enable, + unsigned int match); + int (*set_pwm)(pwm_omap_dmtimer *timer, int def_on, + int toggle, int trigger); + int (*set_prescaler)(pwm_omap_dmtimer *timer, int prescaler); + + int (*write_counter)(pwm_omap_dmtimer *timer, unsigned int value); +}; + +#endif /* __PWM_OMAP_DMTIMER_PDATA_H */ -- cgit v1.2.3 From 5b24a7a2aa2040c8c50c3b71122901d01661ff78 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2015 09:57:27 -0800 Subject: Add 'unsafe' user access functions for batched accesses The naming is meant to discourage random use: the helper functions are not really any more "unsafe" than the traditional double-underscore functions (which need the address range checking), but they do need even more infrastructure around them, and should not be used willy-nilly. In addition to checking the access range, these user access functions require that you wrap the user access with a "user_acess_{begin,end}()" around it. That allows architectures that implement kernel user access control (x86: SMAP, arm64: PAN) to do the user access control in the wrapping user_access_begin/end part, and then batch up the actual user space accesses using the new interfaces. The main (and hopefully only) use for these are for core generic access helpers, initially just the generic user string functions (strnlen_user() and strncpy_from_user()). Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 558129af828a..349557825428 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define probe_kernel_address(addr, retval) \ probe_kernel_read(&retval, addr, sizeof(retval)) +#ifndef user_access_begin +#define user_access_begin() do { } while (0) +#define user_access_end() do { } while (0) +#define unsafe_get_user(x, ptr) __get_user(x, ptr) +#define unsafe_put_user(x, ptr) __put_user(x, ptr) +#endif + #endif /* __LINUX_UACCESS_H__ */ -- cgit v1.2.3 From 9a9e3415edd567813d52c8de402042b9720c54f5 Mon Sep 17 00:00:00 2001 From: Krzysztof Opasiak Date: Fri, 11 Dec 2015 16:06:09 +0100 Subject: fs: configfs: Drop unused parameter from configfs_undepend_item() subsys parameter is never used by configfs_undepend_item() so there is no point in passing it to this function. Signed-off-by: Krzysztof Opasiak Cc: Joel Becker Cc: Christoph Hellwig Signed-off-by: Nicholas Bellinger --- include/linux/configfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 758a029011b1..3b5c6d58b0d2 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -209,7 +209,8 @@ void configfs_unregister_default_group(struct config_group *group); /* These functions can sleep and can alloc with GFP_KERNEL */ /* WARNING: These cannot be called underneath configfs callbacks!! */ -int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target); -void configfs_undepend_item(struct configfs_subsystem *subsys, struct config_item *target); +int configfs_depend_item(struct configfs_subsystem *subsys, + struct config_item *target); +void configfs_undepend_item(struct config_item *target); #endif /* _CONFIGFS_H_ */ -- cgit v1.2.3 From d79d75b5c5182fd94225996db71e06f9cbc7faed Mon Sep 17 00:00:00 2001 From: Krzysztof Opasiak Date: Fri, 11 Dec 2015 16:06:12 +0100 Subject: fs: configfs: Add unlocked version of configfs_depend_item() This change is necessary for the SCSI target usb gadget composed with configfs. In this case configfs will be used for two different purposes: to compose a usb gadget and to configure the target part. If an instance of tcm function is created in $CONFIGFS_ROOT/usb_gadget//functions a tpg can be created in $CONFIGFS_ROOT/target/usb_gadget//, but after a tpg is created the tcm function must not be removed until its corresponding tpg is gone. While the configfs_depend/undepend_item() are meant exactly for creating this kind of dependencies, they are not suitable if the other kernel subsystem happens to be another subsystem in configfs, so this patch adds unlocked versions meant for configfs callbacks. Above description has been provided by: Andrzej Pietrasiewicz In configfs_depend_item() we have to consider two possible cases: 1) When we are called to depend another item in the same subsystem as caller In this case we should skip locking configfs root as we know that configfs is in valid state and our subsystem will not be unregistered during this call. 2) When we are called to depend item in different subsystem than our caller In this case we are also sure that configfs is in valid state but we have to lock root of configfs to avoid unregistration of target's subsystem. As it is other than caller's subsystem, there may be nothing what protects us against unregistration of that subsystem. Signed-off-by: Krzysztof Opasiak Cc: Joel Becker Cc: Christoph Hellwig Signed-off-by: Nicholas Bellinger --- include/linux/configfs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 3b5c6d58b0d2..7ee1a014c56b 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -213,4 +213,20 @@ int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target); void configfs_undepend_item(struct config_item *target); +/* + * These functions can sleep and can alloc with GFP_KERNEL + * NOTE: These should be called only underneath configfs callbacks. + * NOTE: First parameter is a caller's subsystem, not target's. + * WARNING: These cannot be called on newly created item + * (in make_group()/make_item() callback) + */ +int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, + struct config_item *target); + + +static inline void configfs_undepend_item_unlocked(struct config_item *target) +{ + configfs_undepend_item(target); +} + #endif /* _CONFIGFS_H_ */ -- cgit v1.2.3 From 287922eb0b186e2a5bf54fdd04b734c25c90035c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Oct 2015 20:57:30 +0800 Subject: block: defer timeouts to a workqueue Timer context is not very useful for drivers to perform any meaningful abort action from. So instead of calling the driver from this useless context defer it to a workqueue as soon as possible. Note that while a delayed_work item would seem the right thing here I didn't dare to use it due to the magic in blk_add_timer that pokes deep into timer internals. But maybe this encourages Tejun to add a sensible API for that to the workqueue API and we'll all be fine in the end :) Contains a major update from Keith Bush: "This patch removes synchronizing the timeout work so that the timer can start a freeze on its own queue. The timer enters the queue, so timer context can only start a freeze, but not wait for frozen." Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e711f294934c..221dc3bac49f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -407,6 +407,7 @@ struct request_queue { unsigned int rq_timeout; struct timer_list timeout; + struct work_struct timeout_work; struct list_head timeout_list; struct list_head icq_list; -- cgit v1.2.3 From bbc758ec04c2f30805ce0fcdfbaa4c3445fafbae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Nov 2015 09:39:28 +0100 Subject: block: remove REQ_NO_TIMEOUT flag This was added for the 'magic' AEN requests in the NVMe driver that never return. We now handle them purely inside the driver and don't need this core hack any more. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..86a38ea1823f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ - __REQ_NO_TIMEOUT, /* requests may never expire */ __REQ_NR_BITS, /* stops here */ }; @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -#define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U -- cgit v1.2.3 From c89e5b80245899fc51fb1d83880e2f5762fcf350 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 23 Dec 2015 21:05:26 +0530 Subject: PCI/AER: include header file We are having build failure with sparc allmodconfig with the error: drivers/nvme/host/pci.c:15:0: include/linux/aer.h: In function 'pci_enable_pcie_error_reporting': include/linux/aer.h:49:10: error: 'EINVAL' undeclared (first use in this function) The file aer.h is using the error values but they are defined in errno.h. Include errno.h so that we have the definitions of the error codes. Fixes: a0a3408ee614 ("NVMe: Add pci error handlers") Cc: Keith Busch Signed-off-by: Sudip Mukherjee Signed-off-by: Jens Axboe --- include/linux/aer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 744b997d6a94..164049357e5c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,7 @@ #ifndef _AER_H_ #define _AER_H_ +#include #include #define AER_NONFATAL 0 -- cgit v1.2.3 From 0de60af649533ad8d9aaeab1df710e6a728d45ea Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:19 +0200 Subject: net/mlx5_core: Introduce access functions to enable/disable RoCE A mlx5 Ethernet port must be explicitly enabled for RoCE. When RoCE is not enabled on the port, the NIC will refuse to create QPs attached to it and incoming RoCE packets will be considered by the NIC as plain Ethernet packets. Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/vport.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 967e0fd06e89..4c9ac604cccd 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -52,4 +52,7 @@ int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); + #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 9efa75254593d6ca3ae54bac8153f47e1a7cbcda Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:20 +0200 Subject: net/mlx5_core: Introduce access functions to query vport RoCE fields Introduce access functions to query NIC vport system_image_guid, node_guid and qkey_viol_cntr. Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- include/linux/mlx5/vport.h | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1565324eb620..49b34c6466ac 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2141,7 +2141,15 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; - u8 reserved_1[0x760]; + u8 reserved_1[0x120]; + + u8 system_image_guid[0x40]; + u8 port_guid[0x40]; + u8 node_guid[0x40]; + + u8 reserved_5[0x140]; + u8 qkey_violation_counter[0x10]; + u8 reserved_6[0x430]; u8 reserved_2[0x5]; u8 allowed_list_type[0x3]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 4c9ac604cccd..dfb2d9497d2d 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -37,6 +37,11 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); -- cgit v1.2.3 From 3f89a643eb29543af0838d37604bbc29a4e1eb60 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:21 +0200 Subject: IB/mlx5: Extend query_device/port to support RoCE Using the vport access functions to retrieve the Ethernet specific information and return this information in ib_query_device and ib_query_port. Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/driver.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5c857f2a20d7..7b9c976b42d9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -632,13 +632,6 @@ extern struct workqueue_struct *mlx5_core_wq; .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -struct ib_field { - size_t struct_offset_bytes; - size_t struct_size_bytes; - int offset_bits; - int size_bits; -}; - static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) { return pci_get_drvdata(pdev); -- cgit v1.2.3 From cb34be6da25f45034ef4ff6103d401b451165e39 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:22 +0200 Subject: IB/mlx5: Set network_hdr_type upon RoCE responder completion When handling a responder completion, if the link layer is Ethernet, set the work completion network_hdr_type field according to CQE's info and the IB_WC_WITH_NETWORK_HDR_TYPE flag. Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b473cbfa7ef..84aa7e0e1dfa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -628,6 +628,12 @@ enum { CQE_RSS_HTYPE_L4 = 0x3 << 2, }; +enum { + MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH = 0x0, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6 = 0x1, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4 = 0x2, +}; + enum { CQE_L2_OK = 1 << 0, CQE_L3_OK = 1 << 1, -- cgit v1.2.3 From 3cca26069a4b7f6d8fd3dc0ed707e795c22712e2 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:23 +0200 Subject: IB/mlx5: Support IB device's callbacks for adding/deleting GIDs These callbacks write into the mlx5 RoCE address table. Upon del_gid we write a zero'd GID. Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 84aa7e0e1dfa..ea4281b00c8d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -278,6 +278,26 @@ enum { MLX5_DEV_CAP_FLAG_CMDIF_CSUM = 3LL << 46, }; +enum { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +enum { + MLX5_ROCE_VERSION_1_CAP = 1 << MLX5_ROCE_VERSION_1, + MLX5_ROCE_VERSION_2_CAP = 1 << MLX5_ROCE_VERSION_2, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4 = 0, + MLX5_ROCE_L3_TYPE_IPV6 = 1, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4_CAP = 1 << 1, + MLX5_ROCE_L3_TYPE_IPV6_CAP = 1 << 2, +}; + enum { MLX5_OPCODE_NOP = 0x00, MLX5_OPCODE_SEND_INVAL = 0x01, -- cgit v1.2.3 From 2811ba51b04958cd001b6409c9f70e8563376346 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Wed, 23 Dec 2015 18:47:24 +0200 Subject: IB/mlx5: Add RoCE fields to Address Vector Set the address handle and QP address path fields according to the link layer type (IB/Eth). Signed-off-by: Achiad Shochat Signed-off-by: Doug Ledford --- include/linux/mlx5/qp.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f079fb1a31f7..a9ad40169191 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -248,8 +248,12 @@ struct mlx5_av { __be32 dqp_dct; u8 stat_rate_sl; u8 fl_mlid; - __be16 rlid; - u8 reserved0[10]; + union { + __be16 rlid; + __be16 udp_sport; + }; + u8 reserved0[4]; + u8 rmac[6]; u8 tclass; u8 hop_limit; __be32 grh_gid_fl; @@ -456,11 +460,16 @@ struct mlx5_qp_path { u8 static_rate; u8 hop_limit; __be32 tclass_flowlabel; - u8 rgid[16]; - u8 rsvd1[4]; - u8 sl; + union { + u8 rgid[16]; + u8 rip[16]; + }; + u8 f_dscp_ecn_prio; + u8 ecn_dscp; + __be16 udp_sport; + u8 dci_cfi_prio_sl; u8 port; - u8 rsvd2[6]; + u8 rmac[6]; }; struct mlx5_qp_context { -- cgit v1.2.3 From 2eb8c7104c648ad4bfae1f5333f98c09522149b5 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 22 Dec 2015 22:27:58 +0100 Subject: clk: add flag for clocks that need to be enabled on rate changes Some clocks need to be enabled to accept rate changes. This patch adds a new flag CLK_SET_RATE_UNGATE that lets clk_change_rate enable the clock before trying to change the rate and disable it again afterwards. This of course doesn't effect clocks that are already running at that point, as their refcount will only temporarily increase. Signed-off-by: Heiko Stuebner Tested-by: Sjoerd Simons Reviewed-by: Sjoerd Simons Signed-off-by: Michael Turquette --- include/linux/clk-provider.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c56988ac63f7..a971ce462565 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -31,6 +31,7 @@ #define CLK_SET_RATE_NO_REPARENT BIT(7) /* don't re-parent on rate change */ #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */ #define CLK_RECALC_NEW_RATES BIT(9) /* recalc rates after notifications */ +#define CLK_SET_RATE_UNGATE BIT(10) /* clock needs to run to set rate */ struct clk; struct clk_hw; -- cgit v1.2.3 From 7c60bcbb68122b39fe3e92143abce01be75f3fa6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Tue, 15 Dec 2015 20:30:11 +0200 Subject: IB/mlx5: Add support for hca_core_clock and timestamp_mask Reporting the hca_core_clock (in kHZ) and the timestamp_mask in query_device extended verb. timestamp_mask is used by users in order to know what is the valid range of the raw timestamps, while hca_core_clock reports the clock frequency that is used for timestamps. Signed-off-by: Matan Barak Reviewed-by: Moshe Lazer Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 49b34c6466ac..091d8343d594 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -794,15 +794,18 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_63[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_64[0x100]; + u8 reserved_64[0x20]; + u8 device_frequency_mhz[0x20]; + u8 device_frequency_khz[0x20]; + u8 reserved_65[0xa0]; - u8 reserved_65[0x1f]; + u8 reserved_66[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_66[0x220]; + u8 reserved_67[0x220]; }; enum { -- cgit v1.2.3 From b368d7cb8ceb77f481b066bd8be5fada82da7301 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Tue, 15 Dec 2015 20:30:12 +0200 Subject: IB/mlx5: Add hca_core_clock_offset to udata in init_ucontext Pass hca_core_clock_offset to user-space is mandatory in order to let the user-space read the free-running clock register from the right offset in the memory mapped page. Passing this value is done by changing the vendor's command and response of init_ucontext to be in extensible form. Signed-off-by: Matan Barak Reviewed-by: Moshe Lazer Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4281b00c8d..48c4623ad651 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -462,9 +462,12 @@ struct mlx5_init_seg { __be32 rsvd1[120]; __be32 initializing; struct health_buffer health; - __be32 rsvd2[884]; + __be32 rsvd2[880]; + __be32 internal_timer_h; + __be32 internal_timer_l; + __be32 rsvd3[2]; __be32 health_counter; - __be32 rsvd3[1019]; + __be32 rsvd4[1019]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; -- cgit v1.2.3 From 051f263098a90d208e2d20251bfd4834bc783214 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 20 Dec 2015 12:16:11 +0200 Subject: IB/mlx5: Add driver cross-channel support Add support of cross-channel functionality to mlx5 driver. This includes ability to ignore overrun for CQ which intended for cross-channel, export device capability and configure the QP to be sync master/slave queues. The cross-channel enabled QP supports combination of three possible properties: * WQE processing on the receive queue of this QP * WQE processing on the send queue of this QP * WQE are supported on the send queue Reviewed-by: Sagi Grimberg Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/qp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index a9ad40169191..fd1ff4110e80 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -130,6 +130,9 @@ enum { MLX5_QP_BIT_RWE = 1 << 14, MLX5_QP_BIT_RAE = 1 << 13, MLX5_QP_BIT_RIC = 1 << 4, + MLX5_QP_BIT_CC_SLAVE_RECV = 1 << 2, + MLX5_QP_BIT_CC_SLAVE_SEND = 1 << 1, + MLX5_QP_BIT_CC_MASTER = 1 << 0 }; enum { -- cgit v1.2.3 From f91e6d8941bf450f7842dfc1ed80e948aaa65e8c Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 14 Dec 2015 16:34:09 +0200 Subject: net/mlx5_core: Add setting ATOMIC endian mode HW is capable of 2 requestor endianness modes for standard 8 Bytes atomic: BE (0x0) and host endianness (0x1). Read the supported modes from hca atomic capabilities and configure HW to host endianness mode if supported. Signed-off-by: Eran Ben Elisha Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 091d8343d594..991283b51f61 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -66,6 +66,11 @@ enum { MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN = 0x3 }; +enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, + MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, +}; + enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, @@ -527,21 +532,24 @@ enum { struct mlx5_ifc_atomic_caps_bits { u8 reserved_0[0x40]; - u8 atomic_req_endianness[0x1]; - u8 reserved_1[0x1f]; + u8 atomic_req_8B_endianess_mode[0x2]; + u8 reserved_1[0x4]; + u8 supported_atomic_req_8B_endianess_mode_1[0x1]; - u8 reserved_2[0x20]; + u8 reserved_2[0x19]; - u8 reserved_3[0x10]; - u8 atomic_operations[0x10]; + u8 reserved_3[0x20]; u8 reserved_4[0x10]; - u8 atomic_size_qp[0x10]; + u8 atomic_operations[0x10]; u8 reserved_5[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_6[0x10]; u8 atomic_size_dc[0x10]; - u8 reserved_6[0x720]; + u8 reserved_7[0x720]; }; struct mlx5_ifc_odp_cap_bits { -- cgit v1.2.3 From da7525d2a9ae9d9d9af754441befcf2560f6cac3 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 14 Dec 2015 16:34:10 +0200 Subject: IB/mlx5: Advertise atomic capabilities in query device In order to ensure IB spec atomic correctness in atomic operations, if HW is configured to host endianness, advertise IB_ATOMIC_HCA. if not, advertise IB_ATOMIC_NONE. Signed-off-by: Eran Ben Elisha Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/linux/mlx5/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7b9c976b42d9..53c57724c8dd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -115,6 +115,11 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum { + MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, + MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, +}; + enum mlx5_page_fault_resume_flags { MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, -- cgit v1.2.3 From ea861dfd9e0e7e044a6e65fa02a14b9159b568da Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 24 Dec 2015 11:09:39 -0500 Subject: security: Make inode argument of inode_getsecurity non-const Make the inode argument of the inode_getsecurity hook non-const so that we can use it to revalidate invalid security labels. Signed-off-by: Andreas Gruenbacher Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 2 +- include/linux/security.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index ec3a6bab29de..bdd0a3a8a0e4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1413,7 +1413,7 @@ union security_list_options { int (*inode_removexattr)(struct dentry *dentry, const char *name); int (*inode_need_killpriv)(struct dentry *dentry); int (*inode_killpriv)(struct dentry *dentry); - int (*inode_getsecurity)(const struct inode *inode, const char *name, + int (*inode_getsecurity)(struct inode *inode, const char *name, void **buffer, bool alloc); int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, diff --git a/include/linux/security.h b/include/linux/security.h index 2f4c1f7aa7db..9ee61b264b23 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -270,7 +270,7 @@ int security_inode_listxattr(struct dentry *dentry); int security_inode_removexattr(struct dentry *dentry, const char *name); int security_inode_need_killpriv(struct dentry *dentry); int security_inode_killpriv(struct dentry *dentry); -int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc); +int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size); void security_inode_getsecid(const struct inode *inode, u32 *secid); @@ -719,7 +719,7 @@ static inline int security_inode_killpriv(struct dentry *dentry) return cap_inode_killpriv(dentry); } -static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void **buffer, bool alloc) +static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { return -EOPNOTSUPP; } -- cgit v1.2.3 From d6335d77a7622a88380f3f207cc1f727f878dd21 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 24 Dec 2015 11:09:39 -0500 Subject: security: Make inode argument of inode_getsecid non-const Make the inode argument of the inode_getsecid hook non-const so that we can use it to revalidate invalid security labels. Signed-off-by: Andreas Gruenbacher Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- include/linux/audit.h | 8 ++++---- include/linux/lsm_hooks.h | 2 +- include/linux/security.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 20eba1eb0a3c..8a2d046e9f6b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -137,7 +137,7 @@ extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); -extern void __audit_inode_child(const struct inode *parent, +extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void __audit_seccomp(unsigned long syscall, long signr, int code); @@ -202,7 +202,7 @@ static inline void audit_inode_parent_hidden(struct filename *name, __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } -static inline void audit_inode_child(const struct inode *parent, +static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) @@ -359,7 +359,7 @@ static inline void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { } -static inline void __audit_inode_child(const struct inode *parent, +static inline void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } @@ -373,7 +373,7 @@ static inline void audit_file(struct file *file) static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } -static inline void audit_inode_child(const struct inode *parent, +static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index bdd0a3a8a0e4..4c48227450e6 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1420,7 +1420,7 @@ union security_list_options { int flags); int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size); - void (*inode_getsecid)(const struct inode *inode, u32 *secid); + void (*inode_getsecid)(struct inode *inode, u32 *secid); int (*file_permission)(struct file *file, int mask); int (*file_alloc_security)(struct file *file); diff --git a/include/linux/security.h b/include/linux/security.h index 9ee61b264b23..e79149a06454 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -273,7 +273,7 @@ int security_inode_killpriv(struct dentry *dentry); int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size); -void security_inode_getsecid(const struct inode *inode, u32 *secid); +void security_inode_getsecid(struct inode *inode, u32 *secid); int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); @@ -734,7 +734,7 @@ static inline int security_inode_listsecurity(struct inode *inode, char *buffer, return 0; } -static inline void security_inode_getsecid(const struct inode *inode, u32 *secid) +static inline void security_inode_getsecid(struct inode *inode, u32 *secid) { *secid = 0; } -- cgit v1.2.3 From 6f3be9f562e3027c77bc4482ccf2cea8600a7f74 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 24 Dec 2015 11:09:40 -0500 Subject: security: Add hook to invalidate inode security labels Add a hook to invalidate an inode's security label when the cached information becomes invalid. Add the new hook in selinux: set a flag when a security label becomes invalid. Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 6 ++++++ include/linux/security.h | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 4c48227450e6..71969de4058c 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1261,6 +1261,10 @@ * audit_rule_init. * @rule contains the allocated rule * + * @inode_invalidate_secctx: + * Notify the security module that it must revalidate the security context + * of an inode. + * * @inode_notifysecctx: * Notify the security module of what the security context of an inode * should be. Initializes the incore security context managed by the @@ -1516,6 +1520,7 @@ union security_list_options { int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 *secid); void (*release_secctx)(char *secdata, u32 seclen); + void (*inode_invalidate_secctx)(struct inode *inode); int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); @@ -1757,6 +1762,7 @@ struct security_hook_heads { struct list_head secid_to_secctx; struct list_head secctx_to_secid; struct list_head release_secctx; + struct list_head inode_invalidate_secctx; struct list_head inode_notifysecctx; struct list_head inode_setsecctx; struct list_head inode_getsecctx; diff --git a/include/linux/security.h b/include/linux/security.h index e79149a06454..4824a4ccaf1c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -353,6 +353,7 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); +void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); @@ -1093,6 +1094,10 @@ static inline void security_release_secctx(char *secdata, u32 seclen) { } +static inline void security_inode_invalidate_secctx(struct inode *inode) +{ +} + static inline int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return -EOPNOTSUPP; -- cgit v1.2.3 From 60befd2ea1c2061775838ea7bac5cc2b1353afd0 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Tue, 22 Dec 2015 16:37:28 +0200 Subject: gpio: update gpiochip .get() callback description Since gpiochip .get() callback may return a negative error value, it strictly limits the range of possible non-error returned values to a subset of [30:0] bitmask, however on practice on success all gpiochip drivers return either 0 for low signal or 1 for high signal, this is assured by "gpio: *: Be sure to clamp return value" series of changes. To avoid any confusion, misinterpretation and potential errors while developing gpiochip drivers in future convert this implicit assumption to a mandatory rule. For output signals with unknown output signal state gpiochip drivers should return a negative error instead of 0. Signed-off-by: Vladimir Zapolskiy Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index b02c43be7859..990797589408 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -32,8 +32,7 @@ struct seq_file; * (same as GPIOF_DIR_XXX), or negative error * @direction_input: configures signal "offset" as input, or returns error * @direction_output: configures signal "offset" as output, or returns error - * @get: returns value for signal "offset"; for output signals this - * returns either the value actually sensed, or zero + * @get: returns value for signal "offset", 0=low, 1=high, or negative error * @set: assigns output value for signal "offset" * @set_multiple: assigns output values for multiple signals defined by "mask" * @set_debounce: optional hook for setting debounce time for specified gpio in -- cgit v1.2.3 From bb431ba26c5cd0a17c941ca6c3a195a3a6d5d461 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 30 Oct 2015 16:31:47 +0800 Subject: Thermal: initialize thermal zone device correctly After thermal zone device registered, as we have not read any temperature before, thus tz->temperature should not be 0, which actually means 0C, and thermal trend is not available. In this case, we need specially handling for the first thermal_zone_device_update(). Both thermal core framework and step_wise governor is enhanced to handle this. And since the step_wise governor is the only one that uses trends, so it's the only thermal governor that needs to be updated. CC: #3.18+ Tested-by: Manuel Krause Tested-by: szegad Tested-by: prash Tested-by: amish Tested-by: Matthias Reviewed-by: Javi Merino Signed-off-by: Zhang Rui Signed-off-by: Chen Yu --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 613c29bd6baf..103fcbe6bdaf 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -43,6 +43,9 @@ /* Default weight of a bound cooling device */ #define THERMAL_WEIGHT_DEFAULT 0 +/* use value, which < 0K, to indicate an invalid/uninitialized temperature */ +#define THERMAL_TEMP_INVALID -274000 + /* Unit conversion macros */ #define DECI_KELVIN_TO_CELSIUS(t) ({ \ long _t = (t); \ -- cgit v1.2.3 From 4511f7166a2deb5f7a578cf87fd2fe1ae83527e3 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 30 Oct 2015 16:32:10 +0800 Subject: Thermal: do thermal zone update after a cooling device registered When a new cooling device is registered, we need to update the thermal zone to set the new registered cooling device to a proper state. This fixes a problem that the system is cool, while the fan devices are left running on full speed after boot, if fan device is registered after thermal zone device. Here is the history of why current patch looks like this: https://patchwork.kernel.org/patch/7273041/ CC: #3.18+ Reference:https://bugzilla.kernel.org/show_bug.cgi?id=92431 Tested-by: Manuel Krause Tested-by: szegad Tested-by: prash Tested-by: amish Reviewed-by: Javi Merino Signed-off-by: Zhang Rui Signed-off-by: Chen Yu --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 103fcbe6bdaf..e13a1ace50e9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -170,6 +170,7 @@ struct thermal_attr { * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. + * @need_update: if equals 1, thermal_zone_device_update needs to be invoked. * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone @@ -197,6 +198,7 @@ struct thermal_zone_device { int emul_temperature; int passive; unsigned int forced_passive; + atomic_t need_update; struct thermal_zone_device_ops *ops; struct thermal_zone_params *tzp; struct thermal_governor *governor; -- cgit v1.2.3 From b4ffb1909843b28f3b1b60197d517b123b7a9b66 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 25 Dec 2015 16:01:42 -0800 Subject: watchdog: Separate and maintain variables based on variable lifetime All variables required by the watchdog core to manage a watchdog are currently stored in struct watchdog_device. The lifetime of those variables is determined by the watchdog driver. However, the lifetime of variables used by the watchdog core differs from the lifetime of struct watchdog_device. To remedy this situation, watchdog drivers can implement ref and unref callbacks, to be used by the watchdog core to lock struct watchdog_device in memory. While this solves the immediate problem, it depends on watchdog drivers to actually implement the ref/unref callbacks. This is error prone, often not implemented in the first place, or not implemented correctly. To solve the problem without requiring driver support, split the variables in struct watchdog_device into two data structures - one for variables associated with the watchdog driver, one for variables associated with the watchdog core. With this approach, the watchdog core can keep track of its variable lifetime and no longer depends on ref/unref callbacks in the driver. As a side effect, some of the variables originally in struct watchdog_driver are now private to the watchdog core and no longer visible in watchdog drivers. As a side effect of the changes made, an ioctl will now always fail with -ENODEV after a watchdog device was unregistered with the character device still open. Previously, it would only fail with -ENODEV in some situations. Also, ioctl operations are now atomic from driver perspective. With this change, it is now guaranteed that the driver will not unregister a watchdog between a timeout change and the subsequent ping. The 'ref' and 'unref' callbacks in struct watchdog_driver are no longer used and marked as deprecated. Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index a88f955fde92..850af04fe0c7 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -17,6 +17,7 @@ struct watchdog_ops; struct watchdog_device; +struct watchdog_core_data; /** struct watchdog_ops - The watchdog-devices operations * @@ -28,8 +29,6 @@ struct watchdog_device; * @set_timeout:The routine for setting the watchdog devices timeout value (in seconds). * @get_timeleft:The routine that gets the time left before a reset (in seconds). * @restart: The routine for restarting the machine. - * @ref: The ref operation for dyn. allocated watchdog_device structs - * @unref: The unref operation for dyn. allocated watchdog_device structs * @ioctl: The routines that handles extra ioctl calls. * * The watchdog_ops structure contains a list of low-level operations @@ -48,15 +47,14 @@ struct watchdog_ops { int (*set_timeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); int (*restart)(struct watchdog_device *); - void (*ref)(struct watchdog_device *); - void (*unref)(struct watchdog_device *); + void (*ref)(struct watchdog_device *) __deprecated; + void (*unref)(struct watchdog_device *) __deprecated; long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); }; /** struct watchdog_device - The structure that defines a watchdog device * * @id: The watchdog's ID. (Allocated by watchdog_register_device) - * @cdev: The watchdog's Character device. * @dev: The device for our watchdog * @parent: The parent bus device * @info: Pointer to a watchdog_info structure. @@ -67,8 +65,8 @@ struct watchdog_ops { * @max_timeout:The watchdog devices maximum timeout value (in seconds). * @reboot_nb: The notifier block to stop watchdog on reboot. * @restart_nb: The notifier block to register a restart function. - * @driver-data:Pointer to the drivers private data. - * @lock: Lock for watchdog core internal use only. + * @driver_data:Pointer to the drivers private data. + * @wd_data: Pointer to watchdog core internal data. * @status: Field that contains the devices internal status bits. * @deferred: entry in wtd_deferred_reg_list which is used to * register early initialized watchdogs. @@ -84,7 +82,6 @@ struct watchdog_ops { */ struct watchdog_device { int id; - struct cdev cdev; struct device *dev; struct device *parent; const struct watchdog_info *info; @@ -96,15 +93,12 @@ struct watchdog_device { struct notifier_block reboot_nb; struct notifier_block restart_nb; void *driver_data; - struct mutex lock; + struct watchdog_core_data *wd_data; unsigned long status; /* Bit numbers for status flags */ #define WDOG_ACTIVE 0 /* Is the watchdog running/active */ -#define WDOG_DEV_OPEN 1 /* Opened via /dev/watchdog ? */ -#define WDOG_ALLOW_RELEASE 2 /* Did we receive the magic char ? */ -#define WDOG_NO_WAY_OUT 3 /* Is 'nowayout' feature set ? */ -#define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ -#define WDOG_STOP_ON_REBOOT 5 /* Should be stopped on reboot */ +#define WDOG_NO_WAY_OUT 1 /* Is 'nowayout' feature set ? */ +#define WDOG_STOP_ON_REBOOT 2 /* Should be stopped on reboot */ struct list_head deferred; }; -- cgit v1.2.3 From 427e0dc57db7046385ed7618ab24aa5c58dccab1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 1 Jan 2016 23:40:46 +0100 Subject: gpiolib: always initialize *flags from of_get_named_gpio_flags The of_get_named_gpio_flags() function does nothing other than returning an error when CONFIG_OF_GPIO is disabled, but that causes spurious warnings about possible use of uninitialized variables in any code that does not check the of_get_named_gpio_flags() return value before trying to use the flags: drivers/input/misc/rotary_encoder.c: In function 'rotary_encoder_probe': drivers/input/misc/rotary_encoder.c:223:28: warning: 'flags' may be used uninitialized in this function [-Wmaybe-uninitialized] drivers/power/bq24735-charger.c: In function 'bq24735_charger_probe': drivers/power/bq24735-charger.c:227:12: warning: 'flags' may be used uninitialized in this function [-Wmaybe-uninitialized] drivers/power/sbs-battery.c: In function 'sbs_probe': drivers/power/sbs-battery.c:782:17: warning: 'gpio_flags' may be used uninitialized in this function [-Wmaybe-uninitialized] This changes the behavior of the inline helper to set the flags to zero when OF_GPIO is disabled, to avoid the warnings. In all cases I've encountered, we don't actually get to the place that uses the flags if CONFIG_OF is disabled because we won't enter the DT parser code. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Walleij --- include/linux/of_gpio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 87d6d1632dd4..bb85a8eeba6a 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -67,6 +67,9 @@ extern int of_gpio_simple_xlate(struct gpio_chip *gc, static inline int of_get_named_gpio_flags(struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags) { + if (flags) + *flags = 0; + return -ENOSYS; } -- cgit v1.2.3 From b08ea35a3296ee25c4cb53a977b752266dafa2c2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 3 Dec 2015 15:14:13 +0100 Subject: gpio: add a data pointer to gpio_chip This adds a void * pointer to gpio_chip so that driver can assign and retrieve some states. This is done to get rid of container_of() calls for gpio_chips embedded inside state containers, so we can remove the need to have the gpio_chip or later (planned) struct gpio_device be dynamically allocated at registration time, so that its struct device can be properly reference counted and not bound to its parent device (e.g. a platform_device) but instead live on after unregistration if it is opened by e.g. a char device or sysfs. The data is added with the new function gpiochip_add_data() and for compatibility we add static inline wrapper function gpiochip_add() that will call gpiochip_add_data() with NULL as argument. The latter will be removed once we have exorcised gpiochip_add() from the kernel. gpiochip_get_data() is added as a static inline accessor for drivers to quickly get their data out. Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 990797589408..b833a5f9629a 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -23,6 +23,7 @@ struct seq_file; * @parent: optional parent device providing the GPIOs * @cdev: class device used by sysfs interface (may be NULL) * @owner: helps prevent removal of modules exporting active GPIOs + * @data: per-instance data assigned by the driver * @list: links gpio_chips together for traversal * @request: optional hook for chip-specific activation, such as * enabling module power and clock; may sleep @@ -91,6 +92,7 @@ struct gpio_chip { struct device *parent; struct device *cdev; struct module *owner; + void *data; struct list_head list; int (*request)(struct gpio_chip *chip, @@ -165,7 +167,11 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, unsigned offset); /* add/remove chips */ -extern int gpiochip_add(struct gpio_chip *chip); +extern int gpiochip_add_data(struct gpio_chip *chip, void *data); +static inline int gpiochip_add(struct gpio_chip *chip) +{ + return gpiochip_add_data(chip, NULL); +} extern void gpiochip_remove(struct gpio_chip *chip); extern struct gpio_chip *gpiochip_find(void *data, int (*match)(struct gpio_chip *chip, void *data)); @@ -174,6 +180,12 @@ extern struct gpio_chip *gpiochip_find(void *data, int gpiochip_lock_as_irq(struct gpio_chip *chip, unsigned int offset); void gpiochip_unlock_as_irq(struct gpio_chip *chip, unsigned int offset); +/* get driver data */ +static inline void *gpiochip_get_data(struct gpio_chip *chip) +{ + return chip->data; +} + struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc); #ifdef CONFIG_GPIOLIB_IRQCHIP -- cgit v1.2.3 From 3208b0f0c010b26e4d461a3bca59989d03ed9087 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 4 Dec 2015 15:13:53 +0100 Subject: gpio: of: provide optional of_mm_gpiochip_add_data() function In the same spirit as we add an optional void *data argument to the gpiochip_add_data() call, we need this also for of_mm_gpiochip_add(). Signed-off-by: Linus Walleij --- include/linux/of_gpio.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index bb85a8eeba6a..092186c62ff4 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -51,8 +51,14 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) extern int of_get_named_gpio_flags(struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags); -extern int of_mm_gpiochip_add(struct device_node *np, - struct of_mm_gpio_chip *mm_gc); +extern int of_mm_gpiochip_add_data(struct device_node *np, + struct of_mm_gpio_chip *mm_gc, + void *data); +static inline int of_mm_gpiochip_add(struct device_node *np, + struct of_mm_gpio_chip *mm_gc) +{ + return of_mm_gpiochip_add_data(np, mm_gc, NULL); +} extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); extern int of_gpiochip_add(struct gpio_chip *gc); -- cgit v1.2.3 From 0f4630f3720e7e6e921bf525c8357fea7ef3dbab Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 4 Dec 2015 14:02:58 +0100 Subject: gpio: generic: factor into gpio_chip struct The separate struct bgpio_chip has been a pain to handle, both by being confusingly similar in name to struct gpio_chip and for being contained inside a struct so that struct gpio_chip is contained in a struct contained in a struct, making several steps of dereferencing necessary. Make things simpler: include the fields directly into , #ifdef:ed for CONFIG_GENERIC_GPIO, and get rid of the altogether. Prefix some of the member variables with bgpio_* and add proper kerneldoc while we're at it. Modify all users to handle the change and use a struct gpio_chip directly. And while we're at it: replace all container_of() dereferencing by gpiochip_get_data() and registering the gpio_chip with gpiochip_add_data(). Cc: arm@kernel.org Cc: Alexander Shiyan Cc: Shawn Guo Cc: Sascha Hauer Cc: Kukjin Kim Cc: Alexandre Courbot Cc: Brian Norris Cc: Florian Fainelli Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: Nicolas Pitre Cc: Olof Johansson Cc: Vladimir Zapolskiy Cc: Rabin Vincent Cc: linux-arm-kernel@lists.infradead.org Cc: linux-omap@vger.kernel.org Cc: linux-samsung-soc@vger.kernel.org Cc: bcm-kernel-feedback-list@broadcom.com Acked-by: Gregory Fong Acked-by: Liviu Dudau Acked-by: H Hartley Sweeten Acked-by: Tony Lindgren Acked-by: Krzysztof Kozlowski Acked-by: Lee Jones Signed-off-by: Linus Walleij --- include/linux/basic_mmio_gpio.h | 80 ----------------------------------------- include/linux/gpio/driver.h | 54 ++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 80 deletions(-) delete mode 100644 include/linux/basic_mmio_gpio.h (limited to 'include/linux') diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h deleted file mode 100644 index ed3768f4ecc7..000000000000 --- a/include/linux/basic_mmio_gpio.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Basic memory-mapped GPIO controllers. - * - * Copyright 2008 MontaVista Software, Inc. - * Copyright 2008,2010 Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#ifndef __BASIC_MMIO_GPIO_H -#define __BASIC_MMIO_GPIO_H - -#include -#include -#include -#include - -struct bgpio_pdata { - const char *label; - int base; - int ngpio; -}; - -struct device; - -struct bgpio_chip { - struct gpio_chip gc; - - unsigned long (*read_reg)(void __iomem *reg); - void (*write_reg)(void __iomem *reg, unsigned long data); - - void __iomem *reg_dat; - void __iomem *reg_set; - void __iomem *reg_clr; - void __iomem *reg_dir; - - /* Number of bits (GPIOs): * 8. */ - int bits; - - /* - * Some GPIO controllers work with the big-endian bits notation, - * e.g. in a 8-bits register, GPIO7 is the least significant bit. - */ - unsigned long (*pin2mask)(struct bgpio_chip *bgc, unsigned int pin); - - /* - * Used to lock bgpio_chip->data. Also, this is needed to keep - * shadowed and real data registers writes together. - */ - spinlock_t lock; - - /* Shadowed data register to clear/set bits safely. */ - unsigned long data; - - /* Shadowed direction registers to clear/set direction safely. */ - unsigned long dir; -}; - -static inline struct bgpio_chip *to_bgpio_chip(struct gpio_chip *gc) -{ - return container_of(gc, struct bgpio_chip, gc); -} - -int bgpio_remove(struct bgpio_chip *bgc); -int bgpio_init(struct bgpio_chip *bgc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags); - -#define BGPIOF_BIG_ENDIAN BIT(0) -#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ - -#endif /* __BASIC_MMIO_GPIO_H */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index b833a5f9629a..e2d05fd0e6e3 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -8,6 +8,7 @@ #include #include #include +#include struct device; struct gpio_desc; @@ -65,6 +66,23 @@ struct seq_file; * registers. * @irq_not_threaded: flag must be set if @can_sleep is set but the * IRQs don't need to be threaded + * @read_reg: reader function for generic GPIO + * @write_reg: writer function for generic GPIO + * @pin2mask: some generic GPIO controllers work with the big-endian bits + * notation, e.g. in a 8-bits register, GPIO7 is the least significant + * bit. This callback assigns the right bit mask. + * @reg_dat: data (in) register for generic GPIO + * @reg_set: output set register (out=high) for generic GPIO + * @reg_clk: output clear register (out=low) for generic GPIO + * @reg_dir: direction setting register for generic GPIO + * @bgpio_bits: number of register bits used for a generic GPIO i.e. + * * 8 + * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep + * shadowed and real data registers writes together. + * @bgpio_data: shadowed data register for generic GPIO to clear/set bits + * safely. + * @bgpio_dir: shadowed direction register for generic GPIO to clear/set + * direction safely. * @irqchip: GPIO IRQ chip impl, provided by GPIO driver * @irqdomain: Interrupt translation domain; responsible for mapping * between GPIO hwirq number and linux irq number @@ -128,6 +146,20 @@ struct gpio_chip { bool can_sleep; bool irq_not_threaded; +#if IS_ENABLED(CONFIG_GPIO_GENERIC) + unsigned long (*read_reg)(void __iomem *reg); + void (*write_reg)(void __iomem *reg, unsigned long data); + unsigned long (*pin2mask)(struct gpio_chip *gc, unsigned int pin); + void __iomem *reg_dat; + void __iomem *reg_set; + void __iomem *reg_clr; + void __iomem *reg_dir; + int bgpio_bits; + spinlock_t bgpio_lock; + unsigned long bgpio_data; + unsigned long bgpio_dir; +#endif + #ifdef CONFIG_GPIOLIB_IRQCHIP /* * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib @@ -188,6 +220,28 @@ static inline void *gpiochip_get_data(struct gpio_chip *chip) struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc); +#if IS_ENABLED(CONFIG_GPIO_GENERIC) + +struct bgpio_pdata { + const char *label; + int base; + int ngpio; +}; + +int bgpio_init(struct gpio_chip *gc, struct device *dev, + unsigned long sz, void __iomem *dat, void __iomem *set, + void __iomem *clr, void __iomem *dirout, void __iomem *dirin, + unsigned long flags); + +#define BGPIOF_BIG_ENDIAN BIT(0) +#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ + +#endif + #ifdef CONFIG_GPIOLIB_IRQCHIP void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, -- cgit v1.2.3 From 1d1e8cdc823b9c0ed9b51dffef59b874b0aac808 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Mon, 21 Dec 2015 14:13:08 +0100 Subject: PCI/MSI: Fix typos in Fix two comment typos in the header. Signed-off-by: Thomas Petazzoni Signed-off-by: Bjorn Helgaas --- include/linux/msi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index f71a25e5fd25..8974b2177b67 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -186,7 +186,7 @@ struct msi_domain_info; * @msi_free: Domain specific function to free a MSI interrupts * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain - * @msi_finish: Optional callbacl to finalize the allocation + * @msi_finish: Optional callback to finalize the allocation * @set_desc: Set the msi descriptor for an interrupt * @handle_error: Optional error handler if the allocation fails * @@ -194,7 +194,7 @@ struct msi_domain_info; * msi_create_irq_domain() and related interfaces * * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error - * are callbacks used by msi_irq_domain_alloc_irqs() and related + * are callbacks used by msi_domain_alloc_irqs() and related * interfaces which are based on msi_desc. */ struct msi_domain_ops { -- cgit v1.2.3 From 6b9cb42752dafba3761dde0002ca58ca518b6311 Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:12 +0100 Subject: device core: add device_is_bound() Adds a function that tells whether a device is already bound to a driver. This is needed to warn when there is an attempt to change the PM domain of a device that has finished probing already. The reason why we want to enforce that is because in the general case that can cause problems and also that we can simplify code quite a bit if we can always assume that. Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index f627ba20a46c..6d6f1fec092f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1044,6 +1044,8 @@ extern int __must_check driver_attach(struct device_driver *drv); extern void device_initial_probe(struct device *dev); extern int __must_check device_reprobe(struct device *dev); +extern bool device_is_bound(struct device *dev); + /* * Easy functions for dynamically creating devices on the fly */ -- cgit v1.2.3 From 989561de9b5112999475b406557d9c7e9e59c041 Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:13 +0100 Subject: PM / Domains: add setter for dev.pm_domain Adds a function that sets the pointer to dev_pm_domain in struct device and that warns if the device has already finished probing. The reason why we want to enforce that is because in the general case that can cause problems and also that we can simplify code quite a bit if we can always assume that. This patch also changes all current code that directly sets the dev.pm_domain pointer. Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ba4ced38efae..db21d3995f7e 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -240,12 +240,15 @@ static inline int of_genpd_add_provider_onecell(struct device_node *np, #ifdef CONFIG_PM extern int dev_pm_domain_attach(struct device *dev, bool power_on); extern void dev_pm_domain_detach(struct device *dev, bool power_off); +extern void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd); #else static inline int dev_pm_domain_attach(struct device *dev, bool power_on) { return -ENODEV; } static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {} +static inline void dev_pm_domain_set(struct device *dev, + struct dev_pm_domain *pd) {} #endif #endif /* _LINUX_PM_DOMAIN_H */ -- cgit v1.2.3 From aa8e54b559479d0cb7eb632ba443b8cacd20cd4b Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Thu, 7 Jan 2016 16:46:14 +0100 Subject: PM / sleep: Go direct_complete if driver has no callbacks If a suitable prepare callback cannot be found for a given device and its driver has no PM callbacks at all, assume that it can go direct to complete when the system goes to sleep. The reason for this is that there's lots of devices in a system that do no PM at all and there's no reason for them to prevent their ancestors to do direct_complete if they can support it. Signed-off-by: Tomeu Vizoso Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 528be6787796..6a5d654f4447 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -573,6 +573,7 @@ struct dev_pm_info { struct wakeup_source *wakeup; bool wakeup_path:1; bool syscore:1; + bool no_pm_callbacks:1; /* Owned by the PM core */ #else unsigned int should_wakeup:1; #endif -- cgit v1.2.3 From 8ae83b6f76fc74eb6535b9d331a3310a59c32f84 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 30 Dec 2015 13:47:27 +0900 Subject: rtc: s5m: Make register configuration per S2MPS device to remove exceptions Before updating time and alarm the driver must set appropriate mask in UDR register. For that purpose the driver uses common register configuration and a lot of exceptions per device in the code. The exceptions are not obvious, for example except the change in the logic sometimes the fields are swapped (WUDR and AUDR between S2MPS14 and S2MPS15). This leads to quite complicated code. Try to make it more obvious by: 1. Documenting the UDR masks for devices and operations. 2. Adding fields in register configuration structure for each operation (read time, write time and alarm). 3. Splitting the configuration per S2MPS13, S2MPS14 and S2MPS15 thus removing exceptions for them. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Alim Akhtar Tested-by: Alim Akhtar Acked-by: Lee Jones Signed-off-by: Alexandre Belloni --- include/linux/mfd/samsung/rtc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index a65e4655d470..48c3c5be7eb1 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -105,6 +105,8 @@ enum s2mps_rtc_reg { #define S5M_RTC_UDR_MASK (1 << S5M_RTC_UDR_SHIFT) #define S2MPS_RTC_WUDR_SHIFT 4 #define S2MPS_RTC_WUDR_MASK (1 << S2MPS_RTC_WUDR_SHIFT) +#define S2MPS15_RTC_AUDR_SHIFT 4 +#define S2MPS15_RTC_AUDR_MASK (1 << S2MPS15_RTC_AUDR_SHIFT) #define S2MPS13_RTC_AUDR_SHIFT 1 #define S2MPS13_RTC_AUDR_MASK (1 << S2MPS13_RTC_AUDR_SHIFT) #define S2MPS15_RTC_WUDR_SHIFT 1 -- cgit v1.2.3 From 62cd1c40ce1c7c16835b599751c7a002eb5bbdf5 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Sun, 3 Jan 2016 13:32:37 +0200 Subject: watchdog: kill unref/ref ops ref/unref ops are not called at all so even marked them as deprecated is misleading, we need to just drop the API. Signed-off-by: Tomas Winkler Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 850af04fe0c7..aaabd4703b46 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -47,8 +47,6 @@ struct watchdog_ops { int (*set_timeout)(struct watchdog_device *, unsigned int); unsigned int (*get_timeleft)(struct watchdog_device *); int (*restart)(struct watchdog_device *); - void (*ref)(struct watchdog_device *) __deprecated; - void (*unref)(struct watchdog_device *) __deprecated; long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long); }; -- cgit v1.2.3 From faa584757b63aad42d19f1c6a6eac2c848618f83 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 3 Jan 2016 15:11:56 -0800 Subject: watchdog: Add support for creating driver specific sysfs attributes The Zodiac watchdog driver attaches additional sysfs attributes to the watchdog device. This has a number of problems: The watchdog device lifetime differs from the driver lifetime, and the device structure should therefore not be accessed from drivers. Also, creating sysfs attributes after driver registration results in a potential race condition if user space expects the attributes to exist but they don't exist yet. Add support for creating driver specific sysfs attributes to the watchdog core to solve the problems. Signed-off-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index aaabd4703b46..076df50ea0da 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -55,6 +55,8 @@ struct watchdog_ops { * @id: The watchdog's ID. (Allocated by watchdog_register_device) * @dev: The device for our watchdog * @parent: The parent bus device + * @groups: List of sysfs attribute groups to create when creating the + * watchdog device. * @info: Pointer to a watchdog_info structure. * @ops: Pointer to the list of watchdog operations. * @bootstatus: Status of the watchdog device at boot. @@ -82,6 +84,7 @@ struct watchdog_device { int id; struct device *dev; struct device *parent; + const struct attribute_group **groups; const struct watchdog_info *info; const struct watchdog_ops *ops; unsigned int bootstatus; -- cgit v1.2.3 From 0254e953537c92df3e7d0176f401a211e944fd61 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 3 Jan 2016 15:11:58 -0800 Subject: watchdog: Drop pointer to watchdog device from struct watchdog_device The lifetime of the watchdog device pointer is different from the lifetime of its character device. Remove it entirely to avoid race conditions. Signed-off-by: Guenter Roeck Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 076df50ea0da..b585fa2507ee 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -53,7 +53,6 @@ struct watchdog_ops { /** struct watchdog_device - The structure that defines a watchdog device * * @id: The watchdog's ID. (Allocated by watchdog_register_device) - * @dev: The device for our watchdog * @parent: The parent bus device * @groups: List of sysfs attribute groups to create when creating the * watchdog device. @@ -82,7 +81,6 @@ struct watchdog_ops { */ struct watchdog_device { int id; - struct device *dev; struct device *parent; const struct attribute_group **groups; const struct watchdog_info *info; -- cgit v1.2.3 From 069368e91879a3a640cfae4bdc1f9f8cc99c93a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:19 +0100 Subject: lightnvm: move ppa erase logic to core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A device may function in single, dual or quad plane mode. The gennvm media manager manages this with explicit helpers. They convert a single ppa to 1, 2 or 4 separate ppas in a ppa list. To aid implementation of recovery and system blocks, this functionality can be moved directly into the core. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 034117b3be5f..c228dbc803bf 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -427,6 +427,9 @@ extern int nvm_register(struct request_queue *, char *, extern void nvm_unregister(char *); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); +extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From abd805ec9f51f37db9da63dda44c3f4b4ae8ad57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:20 +0100 Subject: lightnvm: refactor rqd ppa list into set/free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A device may be driven in single, double or quad plane mode. In that case, the rqd must have either one, two, or four PPAs set for a single PPA sent to the device. Refactor this logic into their own functions to be shared by program/erase/read in the core. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c228dbc803bf..2fd6871ac7f5 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -429,6 +429,9 @@ extern void nvm_unregister(char *); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); +extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, + struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 91276162de9476b8ff32d9452e849210e5dd09e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:21 +0100 Subject: lightnvm: refactor end_io functions for sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To implement sync I/O support within the LightNVM core, the end_io functions are refactored to take an end_io function pointer instead of testing for initialized media manager, followed by calling its end_io function. Sync I/O can then be implemented using a callback that signal I/O completion. This is similar to the logic found in blk_to_execute_io(). By implementing it this way, the underlying device I/Os submission logic is abstracted away from core, targets, and media managers. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2fd6871ac7f5..9c9fe9ca0441 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -148,6 +148,9 @@ struct ppa_addr { }; }; +struct nvm_rq; +typedef void (nvm_end_io_fn)(struct nvm_rq *, int); + struct nvm_rq { struct nvm_tgt_instance *ins; struct nvm_dev *dev; @@ -164,6 +167,9 @@ struct nvm_rq { void *metadata; dma_addr_t dma_metadata; + struct completion *wait; + nvm_end_io_fn *end_io; + uint8_t opcode; uint16_t nr_pages; uint16_t flags; @@ -347,7 +353,6 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); typedef void (nvm_tgt_exit_fn)(void *); @@ -358,7 +363,7 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_tgt_end_io_fn *end_io; + nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -383,7 +388,6 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); @@ -404,7 +408,6 @@ struct nvmm_type { nvmm_flush_blk_fn *flush_blk; nvmm_submit_io_fn *submit_io; - nvmm_end_io_fn *end_io; nvmm_erase_blk_fn *erase_blk; /* Configuration management */ @@ -434,6 +437,7 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern void nvm_end_io(struct nvm_rq *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From 81e681d3f7424fc2f03b6269e15c63131473c98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:28 +0100 Subject: lightnvm: support multiple ppas in nvm_erase_ppa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes a user want to erase multiple PPAs at the same time. Extend nvm_erase_ppa to take multiple ppas and number of ppas to be erased. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9c9fe9ca0441..a83298f62122 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -435,7 +435,7 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); extern void nvm_end_io(struct nvm_rq *, int); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 72d256ecc5d0c8cbcc0bd5c6d983b434df556cb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:29 +0100 Subject: lightnvm: move rq->error to nvm_rq->error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of passing request error into the LightNVM modules, incorporate it into the nvm_rq. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index a83298f62122..9acc71a9a47f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -149,7 +149,7 @@ struct ppa_addr { }; struct nvm_rq; -typedef void (nvm_end_io_fn)(struct nvm_rq *, int); +typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; @@ -173,6 +173,8 @@ struct nvm_rq { uint8_t opcode; uint16_t nr_pages; uint16_t flags; + + int error; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) -- cgit v1.2.3 From 09719b62fdab031e39b39a6470364a372abdf3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:30 +0100 Subject: lightnvm: introduce nvm_submit_ppa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internal logic for both core and media managers, does not have a backing bio for issuing I/Os. Introduce nvm_submit_ppa to allow raw I/Os to be submitted to the underlying device driver. The function request the device, ppa, data buffer and its length and will submit the I/O synchronously to the device. The return value may therefore be used to detect any errors regarding the issued I/O. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9acc71a9a47f..b7001481e207 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -440,6 +440,8 @@ extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); extern void nvm_end_io(struct nvm_rq *, int); +extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, + void *, int); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From b5d4acd4cbf5029a2616084d9e9f392046d53a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:32 +0100 Subject: lightnvm: fix missing grown bad block type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get/set bad block interface defines good block, factory bad block, grown bad block, device reserved block, and host reserved block. Unfortunately the grown bad block was missing, leaving the offsets wrong for device and host side reserved blocks. This patch adds the missing type and corrects the offsets. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index b7001481e207..4a700a137460 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -58,8 +58,9 @@ enum { /* Block Types */ NVM_BLK_T_FREE = 0x0, NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_DEV = 0x2, - NVM_BLK_T_HOST = 0x4, + NVM_BLK_T_GRWN_BAD = 0x2, + NVM_BLK_T_DEV = 0x4, + NVM_BLK_T_HOST = 0x8, }; struct nvm_id_group { -- cgit v1.2.3 From ff0e498bfa185fad5e86c4c7a2db4f9648d2344f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 12 Jan 2016 07:49:33 +0100 Subject: lightnvm: manage open and closed blocks separately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LightNVM targets need to know the state of the flash block when doing flash optimizations. An example is implementing a write buffer to respect the flash page size. Currently, block state is not accounted for; the media manager only differentiates among free, bad and in-use blocks. This patch adds the logic in the generic media manager to enable targets manage blocks into open and close separately, and it implements such management in rrpc. It also adds a set of flags to describe the state of the block (open, closed, free, bad). In order to avoid taking two locks (nvm_lun and rrpc_lun) consecutively, we introduce lockless get_/put_block primitives so that the open and close list locks and future common logic is handled within the nvm_lun lock. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4a700a137460..aa35907714b8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -229,12 +229,25 @@ struct nvm_lun { int lun_id; int chnl_id; - unsigned int nr_inuse_blocks; /* Number of used blocks */ + /* It is up to the target to mark blocks as closed. If the target does + * not do it, all blocks are marked as open, and nr_open_blocks + * represents the number of blocks in use + */ + unsigned int nr_open_blocks; /* Number of used, writable blocks */ + unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ unsigned int nr_bad_blocks; /* Number of bad blocks */ - struct nvm_block *blocks; spinlock_t lock; + + struct nvm_block *blocks; +}; + +enum { + NVM_BLK_ST_FREE = 0x1, /* Free block */ + NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ + NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; struct nvm_block { @@ -243,7 +256,7 @@ struct nvm_block { unsigned long id; void *priv; - int type; + int state; }; struct nvm_dev { @@ -404,6 +417,8 @@ struct nvmm_type { nvmm_unregister_fn *unregister_mgr; /* Block administration callbacks */ + nvmm_get_blk_fn *get_blk_unlocked; + nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -424,6 +439,10 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); +extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, + struct nvm_lun *, unsigned long); +extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); + extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); -- cgit v1.2.3 From f9a9995072904f2d67d649545f17f81e00f4985e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:34 +0100 Subject: lightnvm: add mccap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some flash media has extended capabilities, such as programming SLC pages on MLC/TLC flash, erase/program suspend, scramble and encryption. MCCAP is introduced to detect support for these capabilities in the command set. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index aa35907714b8..b90d28344e3d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -61,6 +61,12 @@ enum { NVM_BLK_T_GRWN_BAD = 0x2, NVM_BLK_T_DEV = 0x4, NVM_BLK_T_HOST = 0x8, + + /* Memory capabilities */ + NVM_ID_CAP_SLC = 0x1, + NVM_ID_CAP_CMD_SUSPEND = 0x2, + NVM_ID_CAP_SCRAMBLE = 0x4, + NVM_ID_CAP_ENCRYPT = 0x8, }; struct nvm_id_group { @@ -278,6 +284,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; + int mccap; struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable -- cgit v1.2.3 From ca5927e7ab5307965104ca58bbb29d110b1d4545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:35 +0100 Subject: lightnvm: introduce mlc lower page table mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NAND MLC memories have both lower and upper pages. When programming, both of these must be written, before data can be read. However, these lower and upper pages might not placed at even and odd flash pages, but can be skipped. Therefore each flash memory has its lower pages defined, which can then be used when programming and to know when padding are necessary. This patch implements the lower page definition in the specification, and exposes it through a simple lookup table at dev->lptbl. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index b90d28344e3d..678fd91a1f99 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -67,6 +67,20 @@ enum { NVM_ID_CAP_CMD_SUSPEND = 0x2, NVM_ID_CAP_SCRAMBLE = 0x4, NVM_ID_CAP_ENCRYPT = 0x8, + + /* Memory types */ + NVM_ID_FMTYPE_SLC = 0, + NVM_ID_FMTYPE_MLC = 1, +}; + +struct nvm_id_lp_mlc { + u16 num_pairs; + u8 pairs[886]; +}; + +struct nvm_id_lp_tbl { + __u8 id[8]; + struct nvm_id_lp_mlc mlc; }; struct nvm_id_group { @@ -89,6 +103,8 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -297,6 +313,10 @@ struct nvm_dev { int sec_per_blk; int sec_per_lun; + /* lower page table */ + int lps_per_blk; + int *lptbl; + unsigned long total_pages; unsigned long total_blocks; int nr_luns; -- cgit v1.2.3 From e3eb3799f7e0d0924ceeba672ab271865de2802d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:36 +0100 Subject: lightnvm: core on-disk initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An Open-Channel SSD shall be initialized before use. To initialize, we define an on-disk format, that keeps a small set of metadata to bring up the media manager on top of the device. The initial step is introduced to allow a user to format the disks for a given media manager. During format, a system block is stored on one to three separate luns on the device. Each lun has the system block duplicated. During initialization, the system block can be retrieved and the appropriate media manager can initialized. The on-disk format currently covers (struct nvm_system_block): - Magic value "NVMS". - Monotonic increasing sequence number. - The physical block erase count. - Version of the system block format. - Media manager type. - Media manager superblock physical address. The interface provides three functions to manage the system block: int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *) int nvm_get_sysblock(struct nvm *dev, struct nvm_sb_info *) int nvm_update_sysblock(struct nvm *dev, struct nvm_sb_info *) Each implement a part of the logic to manage the system block. The initialization creates the first system blocks and mark them on the device. Get retrieves the latest system block by scanning all pages in the associated system blocks. The update sysblock writes new metadata and allocates new block if necessary. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 678fd91a1f99..7ad22d3f0d2e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -17,6 +17,7 @@ enum { #include #include #include +#include enum { /* HW Responsibilities */ @@ -281,6 +282,15 @@ struct nvm_block { int state; }; +/* system block cpu representation */ +struct nvm_sb_info { + unsigned long seqnr; + unsigned long erase_cnt; + unsigned int version; + char mmtype[NVM_MMTYPE_LEN]; + struct ppa_addr fs_ppa; +}; + struct nvm_dev { struct nvm_dev_ops *ops; @@ -329,6 +339,8 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; char name[DISK_NAME_LEN]; + + struct mutex mlock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -394,6 +406,11 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) +{ + return dev->lptbl[slc_pg]; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); @@ -489,6 +506,24 @@ extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); + +/* sysblk.c */ +#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ + +/* system block on disk representation */ +struct nvm_system_block { + __be32 magic; /* magic signature */ + __be32 seqnr; /* sequence number */ + __be32 erase_cnt; /* erase count */ + __be16 version; /* version number */ + u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ + __be64 fs_ppa; /* PPA for media manager + * superblock */ +}; + +extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From b769207678176d590ea61ce7a64c9100925668b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:38 +0100 Subject: lightnvm: use system block for mm initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use system block information to register the appropriate media manager. This enables the LightNVM subsystem to instantiate a media manager selected by the user, instead of relying on automatic detection by each media manager loaded in the kernel. A device must now be initialized before it can proceed to initialize its media manager. Upon initialization, the configured media manager is automatically initialized as well. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7ad22d3f0d2e..02f36bdf216e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -301,6 +301,9 @@ struct nvm_dev { struct nvmm_type *mt; void *mp; + /* System blocks */ + struct nvm_sb_info sb; + /* Device information */ int nr_chnls; int nr_planes; -- cgit v1.2.3 From 8b4970c41f88ad772771f87b1c82c395248a84d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 12 Jan 2016 07:49:39 +0100 Subject: lightnvm: introduce factory reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that a device can be managed using the system blocks, a method to reset the device is necessary as well. This patch introduces logic to reset the device easily to factory state and exposes it through an ioctl. The ioctl takes the following flags: NVM_FACTORY_ERASE_ONLY_USER By default all blocks, except host-reserved blocks are erased upon factory reset. Instead of this, only erase host-reserved blocks. NVM_FACTORY_RESET_HOST_BLKS Mark host-reserved blocks to be erased and set their type to free. NVM_FACTORY_RESET_GRWN_BBLKS Mark "grown bad blocks" to be erased and set their type to free. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 02f36bdf216e..fc0e7c924967 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -527,6 +527,8 @@ struct nvm_system_block { extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); + +extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; -- cgit v1.2.3 From d307fb16f788823b29aab9aa7e4821ac8a124b19 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 20 Dec 2015 13:52:10 +0200 Subject: Revert "virtio_ring: Update weak barriers to use dma_wmb/rmb" This reverts commit 9e1a27ea42691429e31f158cce6fc61bc79bb2e9. While that commit optimizes !CONFIG_SMP, it mixes up DMA and SMP concepts, making the code hard to figure out. A better way to optimize this is with the new __smp_XXX barriers. As a first step, go back to full rmb/wmb barriers for !SMP. We switch to __smp_XXX barriers in the next patch. Cc: Peter Zijlstra Cc: Alexander Duyck Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) --- include/linux/virtio_ring.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 8e50888a6d59..67e06fe18c03 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -21,20 +21,19 @@ * actually quite cheap. */ +#ifdef CONFIG_SMP static inline void virtio_mb(bool weak_barriers) { -#ifdef CONFIG_SMP if (weak_barriers) smp_mb(); else -#endif mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) - dma_rmb(); + smp_rmb(); else rmb(); } @@ -42,10 +41,26 @@ static inline void virtio_rmb(bool weak_barriers) static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) - dma_wmb(); + smp_wmb(); else wmb(); } +#else +static inline void virtio_mb(bool weak_barriers) +{ + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + wmb(); +} +#endif struct virtio_device; struct virtqueue; -- cgit v1.2.3 From a65961272e1ebdb60804bbe2bb440481fcbd1c76 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 27 Dec 2015 17:55:35 +0200 Subject: virtio_ring: update weak barriers to use virt_xxx virtio ring uses smp_wmb on SMP and wmb on !SMP, the reason for the later being that it might be talking to another kernel on the same SMP machine. This is exactly what virt_xxx barriers do, so switch to these instead of homegrown ifdef hacks. Cc: Peter Zijlstra Cc: Alexander Duyck Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) --- include/linux/virtio_ring.h | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 67e06fe18c03..f3fa55bdd6ce 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -12,7 +12,7 @@ * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO - * accesses through relaxed memory I/O windows, so smp_mb() et al are + * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous @@ -21,11 +21,10 @@ * actually quite cheap. */ -#ifdef CONFIG_SMP static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) - smp_mb(); + virt_mb(); else mb(); } @@ -33,7 +32,7 @@ static inline void virtio_mb(bool weak_barriers) static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) - smp_rmb(); + virt_rmb(); else rmb(); } @@ -41,26 +40,10 @@ static inline void virtio_rmb(bool weak_barriers) static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) - smp_wmb(); + virt_wmb(); else wmb(); } -#else -static inline void virtio_mb(bool weak_barriers) -{ - mb(); -} - -static inline void virtio_rmb(bool weak_barriers) -{ - rmb(); -} - -static inline void virtio_wmb(bool weak_barriers) -{ - wmb(); -} -#endif struct virtio_device; struct virtqueue; -- cgit v1.2.3 From 788e5b3a5da24cc8d93ce2f7c6508181cd7d7fb6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 17 Dec 2015 12:20:39 +0200 Subject: virtio_ring: use virt_store_mb We need a full barrier after writing out event index, using virt_store_mb there seems better than open-coding. As usual, we need a wrapper to account for strong barriers. It's tempting to use this in vhost as well, for that, we'll need a variant of smp_store_mb that works on __user pointers. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) --- include/linux/virtio_ring.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index f3fa55bdd6ce..a156e2b6ccfe 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -45,6 +45,17 @@ static inline void virtio_wmb(bool weak_barriers) wmb(); } +static inline void virtio_store_mb(bool weak_barriers, + __virtio16 *p, __virtio16 v) +{ + if (weak_barriers) { + virt_store_mb(*p, v); + } else { + WRITE_ONCE(*p, v); + mb(); + } +} + struct virtio_device; struct virtqueue; -- cgit v1.2.3 From f7ad26ff952b3ca2702d7da03aad0ab1f6c01d7c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 17 Dec 2015 16:53:43 +0800 Subject: virtio: make find_vqs() checkpatch.pl-friendly checkpatch.pl wants arrays of strings declared as follows: static const char * const names[] = { "vq-1", "vq-2", "vq-3" }; Currently the find_vqs() function takes a const char *names[] argument so passing checkpatch.pl's const char * const names[] results in a compiler error due to losing the second const. This patch adjusts the find_vqs() prototype and updates all virtio transports. This makes it possible for virtio_balloon.c, virtio_input.c, virtgpu_kms.c, and virtio_rpmsg_bus.c to use the checkpatch.pl-friendly type. Signed-off-by: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin Acked-by: Bjorn Andersson --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index e5ce8ab0b8b0..6e6cb0c9d7cb 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -70,7 +70,7 @@ struct virtio_config_ops { int (*find_vqs)(struct virtio_device *, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char *names[]); + const char * const names[]); void (*del_vqs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); -- cgit v1.2.3 From 0ec09ac2cebe9769491a470c33edff0f873ff79d Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 18 Nov 2015 14:49:02 +0900 Subject: PM / devfreq: Set the freq_table of devfreq device This patch initialize the freq_table array of each devfreq device by using the devfreq_set_freq_table(). If freq_table is NULL, the devfreq framework is not able to support the frequency transtion information through sysfs. The OPP core uses the integer type for the number of opps in the opp list and uses the 'unsigned long' type for each frequency. So, this patch modifies the type of some variable as following: - the type of freq_table : unsigned int -> unsigned long - the type of max_state : unsigned int -> int - Corrected types, format strings, mutex usages by MyungJoo Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- include/linux/devfreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 68030e22af35..6fa02a20eb63 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -89,7 +89,7 @@ struct devfreq_dev_profile { int (*get_cur_freq)(struct device *dev, unsigned long *freq); void (*exit)(struct device *dev); - unsigned int *freq_table; + unsigned long *freq_table; unsigned int max_state; }; -- cgit v1.2.3 From 96368701e1c89057bbf39222e965161c68a85b4b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 13 Jan 2016 09:18:55 -0500 Subject: audit: force seccomp event logging to honor the audit_enabled flag Previously we were emitting seccomp audit records regardless of the audit_enabled setting, a deparature from the rest of audit. This patch makes seccomp auditing consistent with the rest of the audit record generation code in that when audit_enabled=0 nothing is logged by the audit subsystem. The bulk of this patch is moving the CONFIG_AUDIT block ahead of the CONFIG_AUDITSYSCALL block in include/linux/audit.h; the only real code change was in the audit_seccomp() definition. Signed-off-by: Tony Jones Signed-off-by: Paul Moore --- include/linux/audit.h | 204 +++++++++++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 20eba1eb0a3c..476bc1237ec2 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -113,6 +113,107 @@ struct filename; extern void audit_log_session_info(struct audit_buffer *ab); +#ifdef CONFIG_AUDIT +/* These are defined in audit.c */ + /* Public API */ +extern __printf(4, 5) +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, + const char *fmt, ...); + +extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); +extern __printf(2, 3) +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); +extern void audit_log_end(struct audit_buffer *ab); +extern bool audit_string_contains_control(const char *string, + size_t len); +extern void audit_log_n_hex(struct audit_buffer *ab, + const unsigned char *buf, + size_t len); +extern void audit_log_n_string(struct audit_buffer *ab, + const char *buf, + size_t n); +extern void audit_log_n_untrustedstring(struct audit_buffer *ab, + const char *string, + size_t n); +extern void audit_log_untrustedstring(struct audit_buffer *ab, + const char *string); +extern void audit_log_d_path(struct audit_buffer *ab, + const char *prefix, + const struct path *path); +extern void audit_log_key(struct audit_buffer *ab, + char *key); +extern void audit_log_link_denied(const char *operation, + struct path *link); +extern void audit_log_lost(const char *message); +#ifdef CONFIG_SECURITY +extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); +#else +static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ } +#endif + +extern int audit_log_task_context(struct audit_buffer *ab); +extern void audit_log_task_info(struct audit_buffer *ab, + struct task_struct *tsk); + +extern int audit_update_lsm_rules(void); + + /* Private API (for audit.c only) */ +extern int audit_filter_user(int type); +extern int audit_filter_type(int type); +extern int audit_rule_change(int type, __u32 portid, int seq, + void *data, size_t datasz); +extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); + +extern u32 audit_enabled; +#else /* CONFIG_AUDIT */ +static inline __printf(4, 5) +void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, + const char *fmt, ...) +{ } +static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, + gfp_t gfp_mask, int type) +{ + return NULL; +} +static inline __printf(2, 3) +void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) +{ } +static inline void audit_log_end(struct audit_buffer *ab) +{ } +static inline void audit_log_n_hex(struct audit_buffer *ab, + const unsigned char *buf, size_t len) +{ } +static inline void audit_log_n_string(struct audit_buffer *ab, + const char *buf, size_t n) +{ } +static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, + const char *string, size_t n) +{ } +static inline void audit_log_untrustedstring(struct audit_buffer *ab, + const char *string) +{ } +static inline void audit_log_d_path(struct audit_buffer *ab, + const char *prefix, + const struct path *path) +{ } +static inline void audit_log_key(struct audit_buffer *ab, char *key) +{ } +static inline void audit_log_link_denied(const char *string, + const struct path *link) +{ } +static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) +{ } +static inline int audit_log_task_context(struct audit_buffer *ab) +{ + return 0; +} +static inline void audit_log_task_info(struct audit_buffer *ab, + struct task_struct *tsk) +{ } +#define audit_enabled 0 +#endif /* CONFIG_AUDIT */ + #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else @@ -212,6 +313,9 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { + if (!audit_enabled) + return; + /* Force a record to be reported if a signal was delivered. */ if (signr || unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); @@ -446,106 +550,6 @@ static inline bool audit_loginuid_set(struct task_struct *tsk) return uid_valid(audit_get_loginuid(tsk)); } -#ifdef CONFIG_AUDIT -/* These are defined in audit.c */ - /* Public API */ -extern __printf(4, 5) -void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, - const char *fmt, ...); - -extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); -extern __printf(2, 3) -void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); -extern void audit_log_end(struct audit_buffer *ab); -extern bool audit_string_contains_control(const char *string, - size_t len); -extern void audit_log_n_hex(struct audit_buffer *ab, - const unsigned char *buf, - size_t len); -extern void audit_log_n_string(struct audit_buffer *ab, - const char *buf, - size_t n); -extern void audit_log_n_untrustedstring(struct audit_buffer *ab, - const char *string, - size_t n); -extern void audit_log_untrustedstring(struct audit_buffer *ab, - const char *string); -extern void audit_log_d_path(struct audit_buffer *ab, - const char *prefix, - const struct path *path); -extern void audit_log_key(struct audit_buffer *ab, - char *key); -extern void audit_log_link_denied(const char *operation, - struct path *link); -extern void audit_log_lost(const char *message); -#ifdef CONFIG_SECURITY -extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); -#else -static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ } -#endif - -extern int audit_log_task_context(struct audit_buffer *ab); -extern void audit_log_task_info(struct audit_buffer *ab, - struct task_struct *tsk); - -extern int audit_update_lsm_rules(void); - - /* Private API (for audit.c only) */ -extern int audit_filter_user(int type); -extern int audit_filter_type(int type); -extern int audit_rule_change(int type, __u32 portid, int seq, - void *data, size_t datasz); -extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); - -extern u32 audit_enabled; -#else /* CONFIG_AUDIT */ -static inline __printf(4, 5) -void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, - const char *fmt, ...) -{ } -static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, - gfp_t gfp_mask, int type) -{ - return NULL; -} -static inline __printf(2, 3) -void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) -{ } -static inline void audit_log_end(struct audit_buffer *ab) -{ } -static inline void audit_log_n_hex(struct audit_buffer *ab, - const unsigned char *buf, size_t len) -{ } -static inline void audit_log_n_string(struct audit_buffer *ab, - const char *buf, size_t n) -{ } -static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, - const char *string, size_t n) -{ } -static inline void audit_log_untrustedstring(struct audit_buffer *ab, - const char *string) -{ } -static inline void audit_log_d_path(struct audit_buffer *ab, - const char *prefix, - const struct path *path) -{ } -static inline void audit_log_key(struct audit_buffer *ab, char *key) -{ } -static inline void audit_log_link_denied(const char *string, - const struct path *link) -{ } -static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ } -static inline int audit_log_task_context(struct audit_buffer *ab) -{ - return 0; -} -static inline void audit_log_task_info(struct audit_buffer *ab, - struct task_struct *tsk) -{ } -#define audit_enabled 0 -#endif /* CONFIG_AUDIT */ static inline void audit_log_string(struct audit_buffer *ab, const char *buf) { audit_log_n_string(ab, buf, strlen(buf)); -- cgit v1.2.3 From c474e348778bdf5b453a2cdff4b2b1f9e000f343 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 9 Jan 2016 22:16:42 +0100 Subject: gpio: generic: make bgpio_pdata always visible Board files that define their own bgpio_pdata are broken when CONFIG_GPIO_GENERIC is disabled and the bgpio_pdata structure definition is hidden by the #ifdef: arch/arm/mach-clps711x/board-autcpu12.c:148:15: error: variable 'autcpu12_mmgpio_pdata' has initializer but incomplete type static struct bgpio_pdata autcpu12_mmgpio_pdata __initdata = { arch/arm/mach-clps711x/board-autcpu12.c:149:2: error: unknown field 'base' specified in initializer .base = AUTCPU12_MMGPIO_BASE, Since the board files should generally not care what drivers are enabled, this makes the structure definition visible again. Signed-off-by: Arnd Bergmann Fixes: 0f4630f3720e ("gpio: generic: factor into gpio_chip struct") Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e2d05fd0e6e3..82fda487453f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -220,14 +220,14 @@ static inline void *gpiochip_get_data(struct gpio_chip *chip) struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc); -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - struct bgpio_pdata { const char *label; int base; int ngpio; }; +#if IS_ENABLED(CONFIG_GPIO_GENERIC) + int bgpio_init(struct gpio_chip *gc, struct device *dev, unsigned long sz, void __iomem *dat, void __iomem *set, void __iomem *clr, void __iomem *dirout, void __iomem *dirin, -- cgit v1.2.3 From a7fd9a4f3e8179bab31e4637236ebb0e0b7867c6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Jan 2016 13:04:11 -0700 Subject: lightnvm: ensure that nvm_dev_ops can be used without CONFIG_NVM null_blk defines an empty version of this ops structure if CONFIG_NVM isn't set, but it doesn't know the type. Move those bits out of the protection of CONFIG_NVM in the main lightnvm include. Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 121 +++++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index fc0e7c924967..d6750111e48e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,6 +1,8 @@ #ifndef NVM_H #define NVM_H +#include + enum { NVM_IO_OK = 0, NVM_IO_REQUEUE = 1, @@ -11,10 +13,71 @@ enum { NVM_IOTYPE_GC = 1, }; +#define NVM_BLK_BITS (16) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) +#define NVM_CH_BITS (8) + +struct ppa_addr { + /* Generic structure for all addresses */ + union { + struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; + u64 sec : NVM_SEC_BITS; + u64 pl : NVM_PL_BITS; + u64 lun : NVM_LUN_BITS; + u64 ch : NVM_CH_BITS; + } g; + + u64 ppa; + }; +}; + +struct nvm_rq; +struct nvm_id; +struct nvm_dev; + +typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, + nvm_l2p_update_fn *, void *); +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, + nvm_bb_update_fn *, void *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); +typedef void (nvm_destroy_dma_pool_fn)(void *); +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, + dma_addr_t *); +typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); + +struct nvm_dev_ops { + nvm_id_fn *identity; + nvm_get_l2p_tbl_fn *get_l2p_tbl; + nvm_op_bb_tbl_fn *get_bb_tbl; + nvm_op_set_bb_fn *set_bb_tbl; + + nvm_submit_io_fn *submit_io; + nvm_erase_blk_fn *erase_block; + + nvm_create_dma_pool_fn *create_dma_pool; + nvm_destroy_dma_pool_fn *destroy_dma_pool; + nvm_dev_dma_alloc_fn *dev_dma_alloc; + nvm_dev_dma_free_fn *dev_dma_free; + + unsigned int max_phys_sect; +}; + + + #ifdef CONFIG_NVM #include -#include #include #include #include @@ -149,29 +212,6 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_BLK_BITS (16) -#define NVM_PG_BITS (16) -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (8) -#define NVM_LUN_BITS (8) -#define NVM_CH_BITS (8) - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - struct { - u64 blk : NVM_BLK_BITS; - u64 pg : NVM_PG_BITS; - u64 sec : NVM_SEC_BITS; - u64 pl : NVM_PL_BITS; - u64 lun : NVM_LUN_BITS; - u64 ch : NVM_CH_BITS; - } g; - - u64 ppa; - }; -}; - struct nvm_rq; typedef void (nvm_end_io_fn)(struct nvm_rq *); @@ -213,39 +253,6 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, - nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; - - unsigned int max_phys_sect; -}; - struct nvm_lun { int id; -- cgit v1.2.3 From 74843787158e9dff249f0528e7d4806102cc2c26 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 9 Jan 2016 12:45:10 +0000 Subject: mmc: atmel-mci: restore dma on AVR32 Commit ecb89f2f5f3e7 ("mmc: atmel-mci: remove compat for non DT board when requesting dma chan") broke dma on AVR32 and any other boards not using DT. This restores a fallback mechanism for such cases. Signed-off-by: Mans Rullgard Acked-by: Hans-Christian Noren Egtvedt Acked-by: Ludovic Desroches Acked-by: Ulf Hansson --- include/linux/atmel-mci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 9177947bf032..e753062b9355 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -2,6 +2,7 @@ #define __LINUX_ATMEL_MCI_H #include +#include #define ATMCI_MAX_NR_SLOTS 2 @@ -37,6 +38,7 @@ struct mci_slot_pdata { */ struct mci_platform_data { struct mci_dma_data *dma_slave; + dma_filter_fn dma_filter; struct mci_slot_pdata slot[ATMCI_MAX_NR_SLOTS]; }; -- cgit v1.2.3 From 238d1c6041ebcb5ce7c075b696f6cc9962991e94 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 9 Jan 2016 12:45:11 +0000 Subject: mmc: atmel: get rid of struct mci_dma_data As struct mci_dma_data is now only used by AVR32, it is nothing but pointless indirection. Replace it with struct dw_dma_slave in the AVR32 platform code and with a void pointer elsewhere. Signed-off-by: Mans Rullgard Acked-by: Hans-Christian Noren Egtvedt Acked-by: Ludovic Desroches Acked-by: Ulf Hansson --- include/linux/atmel-mci.h | 2 +- include/linux/platform_data/mmc-atmel-mci.h | 22 ---------------------- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 include/linux/platform_data/mmc-atmel-mci.h (limited to 'include/linux') diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index e753062b9355..42a9e1884842 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -37,7 +37,7 @@ struct mci_slot_pdata { * @slot: Per-slot configuration data. */ struct mci_platform_data { - struct mci_dma_data *dma_slave; + void *dma_slave; dma_filter_fn dma_filter; struct mci_slot_pdata slot[ATMCI_MAX_NR_SLOTS]; }; diff --git a/include/linux/platform_data/mmc-atmel-mci.h b/include/linux/platform_data/mmc-atmel-mci.h deleted file mode 100644 index 399a2d5a14bd..000000000000 --- a/include/linux/platform_data/mmc-atmel-mci.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __MMC_ATMEL_MCI_H -#define __MMC_ATMEL_MCI_H - -#include -#include - -/** - * struct mci_dma_data - DMA data for MCI interface - */ -struct mci_dma_data { -#ifdef CONFIG_ARM - struct at_dma_slave sdata; -#else - struct dw_dma_slave sdata; -#endif -}; - -/* accessor macros */ -#define slave_data_ptr(s) (&(s)->sdata) -#define find_slave_dev(s) ((s)->sdata.dma_dev) - -#endif /* __MMC_ATMEL_MCI_H */ -- cgit v1.2.3 From b7522056ec1a62a5f3246627e12ef48e6274c21c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 13 Jan 2016 18:39:58 +0300 Subject: ftrace: Remove unused nr_trampolines var It's not needed & not used since introducing old_hash: commit fef5aeeee9e371 ("ftrace: Replace tramp_hash with old_*_hash to save space"). Link: http://lkml.kernel.org/r/1452699598-27610-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 60048c50404e..65460ad93c7b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -164,7 +164,6 @@ struct ftrace_ops { ftrace_func_t saved_func; int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE - int nr_trampolines; struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; struct ftrace_ops_hash old_hash; -- cgit v1.2.3 From 40704b129092eafce9c754199aea5a1f55c47fbc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Dec 2015 16:02:41 +0100 Subject: PCI: host: Add of_pci_get_host_bridge_resources() stub The pcie-rcar driver can be built for any ARM platform (for COMPILE_TEST) including those without CONFIG_OF enabled, and that results in a compile-time error: drivers/pci/host/pcie-rcar.c: In function 'rcar_pcie_parse_request_of_pci_ranges': drivers/pci/host/pcie-rcar.c:939:8: error: implicit declaration of function 'of_pci_get_host_bridge_resources' [-Werror=implicit-function-declaration] err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pci->resources, &iobase); Add a of_pci_get_host_bridge_resources() stub function defined when CONFIG_OF_ADDRESS is disabled to allow compile-testing on all platforms. This mirrors what we do for other OF-specific functions. Fixes: 5d2917d469fa ("PCI: rcar: Convert to DT resource parsing API") Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Helgaas Acked-by: Wolfram Sang Acked-by: Simon Horman --- include/linux/of_pci.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h index 2c51ee78b1c0..f6e9e85164e8 100644 --- a/include/linux/of_pci.h +++ b/include/linux/of_pci.h @@ -59,6 +59,13 @@ static inline void of_pci_check_probe_only(void) { } int of_pci_get_host_bridge_resources(struct device_node *dev, unsigned char busno, unsigned char bus_max, struct list_head *resources, resource_size_t *io_base); +#else +static inline int of_pci_get_host_bridge_resources(struct device_node *dev, + unsigned char busno, unsigned char bus_max, + struct list_head *resources, resource_size_t *io_base) +{ + return -EINVAL; +} #endif #if defined(CONFIG_OF) && defined(CONFIG_PCI_MSI) -- cgit v1.2.3 From e5b6c1518878e157df4121c1caf70d9c470a6d31 Mon Sep 17 00:00:00 2001 From: Jordan Hargrave Date: Fri, 15 Jan 2016 22:08:45 +0100 Subject: firmware: dmi_scan: Save SMBIOS Type 9 System Slots Save SMBIOS Type 9 System Slots during DMI scan. PCI address of onboard devices was already saved but not for slots. Signed-off-by: Jordan Hargrave Signed-off-by: Jean Delvare --- include/linux/dmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 5055ac34142d..5e9c74cf8894 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -22,6 +22,7 @@ enum dmi_device_type { DMI_DEV_TYPE_IPMI = -1, DMI_DEV_TYPE_OEM_STRING = -2, DMI_DEV_TYPE_DEV_ONBOARD = -3, + DMI_DEV_TYPE_DEV_SLOT = -4, }; enum dmi_entry_type { -- cgit v1.2.3 From 46373a15f65fe862f31c19a484acdf551f2b442f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 11 Jan 2016 17:40:31 +0100 Subject: time: nohz: Expose tick_nohz_enabled The cpuidle subsystem needs it. Signed-off-by: Jean Delvare Reviewed-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- include/linux/tick.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index e312219ff823..97fd4e543846 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -98,6 +98,7 @@ static inline void tick_broadcast_exit(void) } #ifdef CONFIG_NO_HZ_COMMON +extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); @@ -106,6 +107,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ +#define tick_nohz_enabled (0) static inline int tick_nohz_tick_stopped(void) { return 0; } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } -- cgit v1.2.3 From 69874ec233871a62e1bc8c89e643993af93a8630 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 11 Dec 2015 11:30:11 +0900 Subject: PCI: Add Netronome NFP4000 PF device ID Add the device ID for the PF of the NFP4000. The device ID for the VF, 0x6003, is already present as PCI_DEVICE_ID_NETRONOME_NFP6000_VF. Signed-off-by: Simon Horman Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 526e2c12ae59..37f05cb1dfd6 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2498,6 +2498,7 @@ #define PCI_VENDOR_ID_NETRONOME 0x19ee #define PCI_DEVICE_ID_NETRONOME_NFP3200 0x3200 #define PCI_DEVICE_ID_NETRONOME_NFP3240 0x3240 +#define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000 #define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 #define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 -- cgit v1.2.3 From d8c1bdeb5d6b62b34a78391206a5e55e4a02d58f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:13 -0800 Subject: page-flags: trivial cleanup for PageTrans* helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use TESTPAGEFLAG_FALSE() to get it a bit cleaner. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index bb53c7b86315..45792d01edea 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -485,21 +485,9 @@ static inline int PageTransTail(struct page *page) } #else - -static inline int PageTransHuge(struct page *page) -{ - return 0; -} - -static inline int PageTransCompound(struct page *page) -{ - return 0; -} - -static inline int PageTransTail(struct page *page) -{ - return 0; -} +TESTPAGEFLAG_FALSE(TransHuge) +TESTPAGEFLAG_FALSE(TransCompound) +TESTPAGEFLAG_FALSE(TransTail) #endif /* -- cgit v1.2.3 From 0e6d31a7336f41ef0375f5398c79e54de8e219b6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:17 -0800 Subject: page-flags: move code around MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preparation patch: we are going to use compound_head(), PageTail() and PageCompound() to define page-flags helpers. Let's define them before macros. We cannot user PageHead() helper in PageCompound() as it's not yet defined -- use test_bit(PG_head, &page->flags) instead. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 45792d01edea..83161a22509c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -133,6 +133,27 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +struct page; /* forward declaration */ + +static inline struct page *compound_head(struct page *page) +{ + unsigned long head = READ_ONCE(page->compound_head); + + if (unlikely(head & 1)) + return (struct page *) (head - 1); + return page; +} + +static inline int PageTail(struct page *page) +{ + return READ_ONCE(page->compound_head) & 1; +} + +static inline int PageCompound(struct page *page) +{ + return test_bit(PG_head, &page->flags) || PageTail(page); +} + /* * Macros to create function definitions for page flags */ @@ -204,7 +225,6 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) @@ -395,11 +415,6 @@ static inline void set_page_writeback_keepwrite(struct page *page) __PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) -static inline int PageTail(struct page *page) -{ - return READ_ONCE(page->compound_head) & 1; -} - static inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); @@ -410,20 +425,6 @@ static inline void clear_compound_head(struct page *page) WRITE_ONCE(page->compound_head, 0); } -static inline struct page *compound_head(struct page *page) -{ - unsigned long head = READ_ONCE(page->compound_head); - - if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; -} - -static inline int PageCompound(struct page *page) -{ - return PageHead(page) || PageTail(page); - -} #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { -- cgit v1.2.3 From 95ad97554ac81b31139d4fe5ed8757a07087cd90 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:21 -0800 Subject: page-flags: introduce page flags policies wrt compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a third argument to macros which create function definitions for page flags. This argument defines how page-flags helpers behave on compound functions. For now we define four policies: - PF_ANY: the helper function operates on the page it gets, regardless if it's non-compound, head or tail. - PF_HEAD: the helper function operates on the head page of the compound page if it gets tail page. - PF_NO_TAIL: only head and non-compond pages are acceptable for this helper function. - PF_NO_COMPOUND: only non-compound pages are acceptable for this helper function. For now we use policy PF_ANY for all helpers, which matches current behaviour. We do not enforce the policy for TESTPAGEFLAG, because we have flags checked for random pages all over the kernel. Noticeable exception to this is PageTransHuge() which triggers VM_BUG_ON() for tail page. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 6 ++ include/linux/page-flags.h | 166 ++++++++++++++++++++++++++++----------------- 2 files changed, 108 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 772362adf471..053824b0a412 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm); #define VIRTUAL_BUG_ON(cond) do { } while (0) #endif +#ifdef CONFIG_DEBUG_VM_PGFLAGS +#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page) +#else +#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond) +#endif + #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 83161a22509c..12ab023b67f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -154,49 +154,80 @@ static inline int PageCompound(struct page *page) return test_bit(PG_head, &page->flags) || PageTail(page); } +/* + * Page flags policies wrt compound pages + * + * PF_ANY: + * the page flag is relevant for small, head and tail pages. + * + * PF_HEAD: + * for compound page all operations related to the page flag applied to + * head page. + * + * PF_NO_TAIL: + * modifications of the page flag must be done on small or head pages, + * checks can be done on tail pages too. + * + * PF_NO_COMPOUND: + * the page flag is not relevant for compound pages. + */ +#define PF_ANY(page, enforce) page +#define PF_HEAD(page, enforce) compound_head(page) +#define PF_NO_TAIL(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ + compound_head(page);}) +#define PF_NO_COMPOUND(page, enforce) ({ \ + VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ + page;}) + /* * Macros to create function definitions for page flags */ -#define TESTPAGEFLAG(uname, lname) \ -static inline int Page##uname(const struct page *page) \ - { return test_bit(PG_##lname, &page->flags); } +#define TESTPAGEFLAG(uname, lname, policy) \ +static inline int Page##uname(struct page *page) \ + { return test_bit(PG_##lname, &policy(page, 0)->flags); } -#define SETPAGEFLAG(uname, lname) \ +#define SETPAGEFLAG(uname, lname, policy) \ static inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &page->flags); } + { set_bit(PG_##lname, &policy(page, 1)->flags); } -#define CLEARPAGEFLAG(uname, lname) \ +#define CLEARPAGEFLAG(uname, lname, policy) \ static inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &page->flags); } + { clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __SETPAGEFLAG(uname, lname) \ +#define __SETPAGEFLAG(uname, lname, policy) \ static inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &page->flags); } + { __set_bit(PG_##lname, &policy(page, 1)->flags); } -#define __CLEARPAGEFLAG(uname, lname) \ +#define __CLEARPAGEFLAG(uname, lname, policy) \ static inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &page->flags); } + { __clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTSETFLAG(uname, lname) \ +#define TESTSETFLAG(uname, lname, policy) \ static inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &page->flags); } + { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } -#define TESTCLEARFLAG(uname, lname) \ +#define TESTCLEARFLAG(uname, lname, policy) \ static inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &page->flags); } + { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname) \ +#define __TESTCLEARFLAG(uname, lname, policy) \ static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &page->flags); } + { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) +#define PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + SETPAGEFLAG(uname, lname, policy) \ + CLEARPAGEFLAG(uname, lname, policy) -#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ - __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) +#define __PAGEFLAG(uname, lname, policy) \ + TESTPAGEFLAG(uname, lname, policy) \ + __SETPAGEFLAG(uname, lname, policy) \ + __CLEARPAGEFLAG(uname, lname, policy) -#define TESTSCFLAG(uname, lname) \ - TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTSCFLAG(uname, lname, policy) \ + TESTSETFLAG(uname, lname, policy) \ + TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ static inline int Page##uname(const struct page *page) { return 0; } @@ -225,46 +256,48 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) - -TESTPAGEFLAG(Locked, locked) -PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) -PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) - __SETPAGEFLAG(Referenced, referenced) -PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) -PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) -PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) - TESTCLEARFLAG(Active, active) -__PAGEFLAG(Slab, slab) -PAGEFLAG(Checked, checked) /* Used by some filesystems */ -PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ -PAGEFLAG(SavePinned, savepinned); /* Xen */ -PAGEFLAG(Foreign, foreign); /* Xen */ -PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) -PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) - __SETPAGEFLAG(SwapBacked, swapbacked) - -__PAGEFLAG(SlobFree, slob_free) +TESTPAGEFLAG(Locked, locked, PF_ANY) +PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) +PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) + __SETPAGEFLAG(Referenced, referenced, PF_ANY) +PAGEFLAG(Dirty, dirty, PF_ANY) TESTSCFLAG(Dirty, dirty, PF_ANY) + __CLEARPAGEFLAG(Dirty, dirty, PF_ANY) +PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) +PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) + TESTCLEARFLAG(Active, active, PF_ANY) +__PAGEFLAG(Slab, slab, PF_ANY) +PAGEFLAG(Checked, checked, PF_ANY) /* Used by some filesystems */ +PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ +PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ +PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ +PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) +PAGEFLAG(SwapBacked, swapbacked, PF_ANY) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) + +__PAGEFLAG(SlobFree, slob_free, PF_ANY) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ -PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) - __CLEARPAGEFLAG(Private, private) -PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) -PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) +PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) + __CLEARPAGEFLAG(Private, private, PF_ANY) +PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) + TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ -TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -PAGEFLAG(MappedToDisk, mappedtodisk) +TESTPAGEFLAG(Writeback, writeback, PF_ANY) TESTSCFLAG(Writeback, writeback, PF_ANY) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_ANY) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) +PAGEFLAG(Reclaim, reclaim, PF_ANY) TESTCLEARFLAG(Reclaim, reclaim, PF_ANY) +PAGEFLAG(Readahead, reclaim, PF_ANY) TESTCLEARFLAG(Readahead, reclaim, PF_ANY) #ifdef CONFIG_HIGHMEM /* @@ -277,31 +310,32 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache) +PAGEFLAG(SwapCache, swapcache, PF_ANY) #else PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) - TESTCLEARFLAG(Unevictable, unevictable) +PAGEFLAG(Unevictable, unevictable, PF_ANY) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_ANY) + TESTCLEARFLAG(Unevictable, unevictable, PF_ANY) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) - TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) +PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) + TESTSCFLAG(Mlocked, mlocked, PF_ANY) __TESTCLEARFLAG(Mlocked, mlocked, PF_ANY) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached) +PAGEFLAG(Uncached, uncached, PF_ANY) #else PAGEFLAG_FALSE(Uncached) #endif #ifdef CONFIG_MEMORY_FAILURE -PAGEFLAG(HWPoison, hwpoison) -TESTSCFLAG(HWPoison, hwpoison) +PAGEFLAG(HWPoison, hwpoison, PF_ANY) +TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison) @@ -309,10 +343,10 @@ PAGEFLAG_FALSE(HWPoison) #endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) -TESTPAGEFLAG(Young, young) -SETPAGEFLAG(Young, young) -TESTCLEARFLAG(Young, young) -PAGEFLAG(Idle, idle) +TESTPAGEFLAG(Young, young, PF_ANY) +SETPAGEFLAG(Young, young, PF_ANY) +TESTCLEARFLAG(Young, young, PF_ANY) +PAGEFLAG(Idle, idle, PF_ANY) #endif /* @@ -393,7 +427,7 @@ static inline void SetPageUptodate(struct page *page) set_bit(PG_uptodate, &(page)->flags); } -CLEARPAGEFLAG(Uptodate, uptodate) +CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); @@ -413,7 +447,7 @@ static inline void set_page_writeback_keepwrite(struct page *page) test_set_page_writeback_keepwrite(page); } -__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) +__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) static inline void set_compound_head(struct page *page, struct page *head) { @@ -615,6 +649,10 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +#undef PF_ANY +#undef PF_HEAD +#undef PF_NO_TAIL +#undef PF_NO_COMPOUND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ -- cgit v1.2.3 From 48c935ad88f5be20eb5445a77c171351b1eb5111 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:24 -0800 Subject: page-flags: define PG_locked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lock_page() must operate on the whole compound page. It doesn't make much sense to lock part of compound page. Change code to use head page's PG_locked, if tail page is passed. This patch also gets rid of custom helper functions -- __set_page_locked() and __clear_page_locked(). They are replaced with helpers generated by __SETPAGEFLAG/__CLEARPAGEFLAG. Tail pages to these helper would trigger VM_BUG_ON(). SLUB uses PG_locked as a bit spin locked. IIUC, tail pages should never appear there. VM_BUG_ON() is added to make sure that this assumption is correct. [akpm@linux-foundation.org: fix fs/cifs/file.c] Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- include/linux/pagemap.h | 25 ++++++++----------------- 2 files changed, 9 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 12ab023b67f2..32c87eb470cb 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -256,7 +256,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) -TESTPAGEFLAG(Locked, locked, PF_ANY) +__PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) __SETPAGEFLAG(Referenced, referenced, PF_ANY) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26eabf5ec718..df214a4b886d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -433,18 +433,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); -static inline void __set_page_locked(struct page *page) -{ - __set_bit(PG_locked, &page->flags); -} - -static inline void __clear_page_locked(struct page *page) -{ - __clear_bit(PG_locked, &page->flags); -} - static inline int trylock_page(struct page *page) { + page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } @@ -497,9 +488,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page, static inline int wait_on_page_locked_killable(struct page *page) { - if (PageLocked(page)) - return wait_on_page_bit_killable(page, PG_locked); - return 0; + if (!PageLocked(page)) + return 0; + return wait_on_page_bit_killable(compound_head(page), PG_locked); } extern wait_queue_head_t *page_waitqueue(struct page *page); @@ -518,7 +509,7 @@ static inline void wake_up_page(struct page *page, int bit) static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) - wait_on_page_bit(page, PG_locked); + wait_on_page_bit(compound_head(page), PG_locked); } /* @@ -664,17 +655,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: - * the page is new, so we can just run __set_page_locked() against it. + * the page is new, so we can just run __SetPageLocked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; - __set_page_locked(page); + __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) - __clear_page_locked(page); + __ClearPageLocked(page); return error; } -- cgit v1.2.3 From df8c94d13c7e30f4471f8faa8d544809a0e52865 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:28 -0800 Subject: page-flags: define behavior of FS/IO-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems we don't have compound page on FS/IO path currently. Use PF_NO_COMPOUND to catch if we have. The odd exception is PG_dirty: sound uses compound pages and maps them with PTEs. PF_NO_COMPOUND triggers VM_BUG_ON() in set_page_dirty() on handling shared fault. Let's use PF_HEAD for PG_dirty. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 32c87eb470cb..2493f80b949b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -257,16 +257,16 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Error, error, PF_ANY) TESTCLEARFLAG(Error, error, PF_ANY) +PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) __SETPAGEFLAG(Referenced, referenced, PF_ANY) -PAGEFLAG(Dirty, dirty, PF_ANY) TESTSCFLAG(Dirty, dirty, PF_ANY) - __CLEARPAGEFLAG(Dirty, dirty, PF_ANY) +PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) + __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) TESTCLEARFLAG(Active, active, PF_ANY) __PAGEFLAG(Slab, slab, PF_ANY) -PAGEFLAG(Checked, checked, PF_ANY) /* Used by some filesystems */ +PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ @@ -292,12 +292,15 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ -TESTPAGEFLAG(Writeback, writeback, PF_ANY) TESTSCFLAG(Writeback, writeback, PF_ANY) -PAGEFLAG(MappedToDisk, mappedtodisk, PF_ANY) +TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) + TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ -PAGEFLAG(Reclaim, reclaim, PF_ANY) TESTCLEARFLAG(Reclaim, reclaim, PF_ANY) -PAGEFLAG(Readahead, reclaim, PF_ANY) TESTCLEARFLAG(Readahead, reclaim, PF_ANY) +PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -413,7 +416,7 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { smp_wmb(); - __set_bit(PG_uptodate, &(page)->flags); + __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) @@ -424,7 +427,7 @@ static inline void SetPageUptodate(struct page *page) * uptodate are actually visible before PageUptodate becomes true. */ smp_wmb(); - set_bit(PG_uptodate, &(page)->flags); + set_bit(PG_uptodate, &page->flags); } CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) -- cgit v1.2.3 From 8cb38fabb6bc1ba8bcec83eaf04848d886b54d28 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:32 -0800 Subject: page-flags: define behavior of LRU-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only head pages are ever on LRU. Let's use PF_HEAD policy to avoid any confusion for all LRU-related flags. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2493f80b949b..88a3bcba57d6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -258,13 +258,14 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) -PAGEFLAG(Referenced, referenced, PF_ANY) TESTCLEARFLAG(Referenced, referenced, PF_ANY) - __SETPAGEFLAG(Referenced, referenced, PF_ANY) +PAGEFLAG(Referenced, referenced, PF_HEAD) + TESTCLEARFLAG(Referenced, referenced, PF_HEAD) + __SETPAGEFLAG(Referenced, referenced, PF_HEAD) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) -PAGEFLAG(LRU, lru, PF_ANY) __CLEARPAGEFLAG(LRU, lru, PF_ANY) -PAGEFLAG(Active, active, PF_ANY) __CLEARPAGEFLAG(Active, active, PF_ANY) - TESTCLEARFLAG(Active, active, PF_ANY) +PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) +PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) + TESTCLEARFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_ANY) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ @@ -318,9 +319,9 @@ PAGEFLAG(SwapCache, swapcache, PF_ANY) PAGEFLAG_FALSE(SwapCache) #endif -PAGEFLAG(Unevictable, unevictable, PF_ANY) - __CLEARPAGEFLAG(Unevictable, unevictable, PF_ANY) - TESTCLEARFLAG(Unevictable, unevictable, PF_ANY) +PAGEFLAG(Unevictable, unevictable, PF_HEAD) + __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) + TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) -- cgit v1.2.3 From dcb351cd095a3a1e1100b74f15a0100cf9a0c700 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:35 -0800 Subject: page-flags: define behavior SL*B-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SL*B uses compound pages and marks head pages with PG_slab. __SetPageSlab() and __ClearPageSlab() are never called for tail pages. The same situation with PG_slob_free in SLOB allocator. PF_NO_TAIL is appropriate for these flags. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 88a3bcba57d6..29d8805aaa23 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -266,7 +266,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) -__PAGEFLAG(Slab, slab, PF_ANY) +__PAGEFLAG(Slab, slab, PF_NO_TAIL) +__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ @@ -276,8 +277,6 @@ PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -__PAGEFLAG(SlobFree, slob_free, PF_ANY) - /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. -- cgit v1.2.3 From c13985fa800312fdcd4c7d67a1f55abcbc2f6b7d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:39 -0800 Subject: page-flags: define behavior of Xen-related flags on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PG_pinned and PG_savepinned are about page table's pages which are never compound. I'm not so sure about PG_foreign, but it seems we shouldn't see compound pages there too. Let's use PF_NO_COMPOUND for all of them. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 29d8805aaa23..6e7c7c66b6ca 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -269,9 +269,13 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ -PAGEFLAG(Pinned, pinned, PF_ANY) TESTSCFLAG(Pinned, pinned, PF_ANY) /* Xen */ -PAGEFLAG(SavePinned, savepinned, PF_ANY); /* Xen */ -PAGEFLAG(Foreign, foreign, PF_ANY); /* Xen */ + +/* Xen */ +PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) + TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) +PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); +PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); + PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -- cgit v1.2.3 From de09d31dd38a50fdce106c15abd68432eebbd014 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:42 -0800 Subject: page-flags: define PG_reserved behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As far as I can see there's no users of PG_reserved on compound pages. Let's use PF_NO_COMPOUND here. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6e7c7c66b6ca..dbfd8f325f98 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -276,7 +276,8 @@ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); -PAGEFLAG(Reserved, reserved, PF_ANY) __CLEARPAGEFLAG(Reserved, reserved, PF_ANY) +PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) + __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_ANY) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) -- cgit v1.2.3 From da5efc408baefd686b0ee2cbd1353eb10ec71a0f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:46 -0800 Subject: page-flags: define PG_swapbacked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PG_swapbacked is used for transparent huge pages. For head pages only. Let's use PF_NO_TAIL policy. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dbfd8f325f98..eda487ecc01c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -278,9 +278,9 @@ PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -PAGEFLAG(SwapBacked, swapbacked, PF_ANY) - __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_ANY) - __SETPAGEFLAG(SwapBacked, swapbacked, PF_ANY) +PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) + __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page -- cgit v1.2.3 From 50ea78d676d4282a403f956229505b9fddf69f3a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:49 -0800 Subject: page-flags: define PG_swapcache behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swap cannot handle compound pages so far. Transparent huge pages are split on the way to swap. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index eda487ecc01c..7fc2ea83cbd5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -318,7 +318,7 @@ PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP -PAGEFLAG(SwapCache, swapcache, PF_ANY) +PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(SwapCache) #endif -- cgit v1.2.3 From e4f87d5d752d259b274681420b010c65006301a6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:53 -0800 Subject: page-flags: define PG_mlocked behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Transparent huge pages can be mlocked -- whole compund page at once. Something went wrong if we're trying to mlock() tail page. Let's use PF_NO_TAIL. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7fc2ea83cbd5..43b7acb092ff 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,8 +328,10 @@ PAGEFLAG(Unevictable, unevictable, PF_HEAD) TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked, PF_ANY) __CLEARPAGEFLAG(Mlocked, mlocked, PF_ANY) - TESTSCFLAG(Mlocked, mlocked, PF_ANY) __TESTCLEARFLAG(Mlocked, mlocked, PF_ANY) +PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) + TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) + __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) -- cgit v1.2.3 From b9d418170aefb020ebd4c60040d69c4399851aa3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:51:56 -0800 Subject: page-flags: define PG_uncached behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far, only IA64 uses PG_uncached and only on non-compound pages. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 43b7acb092ff..dff90852fbc6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -338,7 +338,7 @@ PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_ANY) +PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached) #endif -- cgit v1.2.3 From d2998c4de2937b964ea63aa2c08183f28462d532 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:00 -0800 Subject: page-flags: define PG_uptodate behavior on compound pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We use PG_uptodate on head pages on transparent huge page. Let's use PF_NO_TAIL. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dff90852fbc6..818fa39538a9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -404,8 +404,9 @@ u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) { - int ret = test_bit(PG_uptodate, &(page)->flags); - + int ret; + page = compound_head(page); + ret = test_bit(PG_uptodate, &(page)->flags); /* * Must ensure that the data we read out of the page is loaded * _after_ we've loaded page->flags to check for PageUptodate. @@ -422,12 +423,14 @@ static inline int PageUptodate(struct page *page) static inline void __SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); __set_bit(PG_uptodate, &page->flags); } static inline void SetPageUptodate(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the page @@ -437,7 +440,7 @@ static inline void SetPageUptodate(struct page *page) set_bit(PG_uptodate, &page->flags); } -CLEARPAGEFLAG(Uptodate, uptodate, PF_ANY) +CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); -- cgit v1.2.3 From 822cdd1152265d87fcfc974e06c3b68f762987fd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:03 -0800 Subject: page-flags: look at head page if the flag is encoded in page->mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PageAnon() and PageKsm() look at lower bits of page->mapping to check if the page is Anon or KSM. page->mapping can be overloaded in tail pages. Let's always look at head page to avoid false-positives. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Cc: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 818fa39538a9..190f1915a097 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,7 +176,7 @@ static inline int PageCompound(struct page *page) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ compound_head(page);}) -#define PF_NO_COMPOUND(page, enforce) ({ \ +#define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ page;}) @@ -381,6 +381,7 @@ PAGEFLAG(Idle, idle, PF_ANY) static inline int PageAnon(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; } @@ -393,6 +394,7 @@ static inline int PageAnon(struct page *page) */ static inline int PageKsm(struct page *page) { + page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); } -- cgit v1.2.3 From 1c290f642101e64f379e38ea0361d097c08e824d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:07 -0800 Subject: mm: sanitize page->mapping for tail pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't define meaning of page->mapping for tail pages. Currently it's always NULL, which can be inconsistent with head page and potentially lead to problems. Let's poison the pointer to catch all illigal uses. page_rmapping(), page_mapping() and page_anon_vma() are changed to look on head page. The only illegal use I've caught so far is __GPF_COMP pages from sound subsystem, mapped with PTEs. do_shared_fault() is changed to use page_rmapping() instead of direct access to fault_page->mapping. Signed-off-by: Kirill A. Shutemov Reviewed-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 317e16de09e5..76c3b6c38c16 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -32,6 +32,10 @@ /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) + /********** mm/slab.c **********/ /* * Magic nums for obj red zoning. -- cgit v1.2.3 From 685eaade56c66c806dbe8102f12e2926cf4ec870 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:10 -0800 Subject: page-flags: drop __TestClearPage*() helpers Nobody uses them. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 190f1915a097..7bc7fd9c4c5c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -211,10 +211,6 @@ static inline int TestSetPage##uname(struct page *page) \ static inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -#define __TESTCLEARFLAG(uname, lname, policy) \ -static inline int __TestClearPage##uname(struct page *page) \ - { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } - #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ @@ -247,9 +243,6 @@ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } -#define __TESTCLEARFLAG_FALSE(uname) \ -static inline int __TestClearPage##uname(struct page *page) { return 0; } - #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) @@ -331,10 +324,9 @@ PAGEFLAG(Unevictable, unevictable, PF_HEAD) PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) - __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) - TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) + TESTSCFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -- cgit v1.2.3 From d281ee6145183594788ab6d5b55f8d144e69eace Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:16 -0800 Subject: rmap: add argument to charge compound page We're going to allow mapping of individual 4k pages of THP compound page. It means we cannot rely on PageTransHuge() check to decide if map/unmap small page or THP. The patch adds new argument to rmap functions to indicate whether we want to operate on whole compound page or only the small page. [n-horiguchi@ah.jp.nec.com: fix mapcount mismatch in hugepage migration] Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 29446aeef36e..038b6e704d9b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -161,16 +161,22 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *page_get_anon_vma(struct page *page); +/* bitflags for do_page_add_anon_rmap() */ +#define RMAP_EXCLUSIVE 0x01 +#define RMAP_COMPOUND 0x02 + /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); +void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, + unsigned long, bool); void page_add_file_rmap(struct page *); -void page_remove_rmap(struct page *); +void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -- cgit v1.2.3 From f627c2f53786b0445abca47f6aa84c96a1fffec2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:20 -0800 Subject: memcg: adjust to support new THP refcounting As with rmap, with new refcounting we cannot rely on PageTransHuge() to check if we need to charge size of huge page form the cgroup. We need to get information from caller to know whether it was mapped with PMD or PTE. We do uncharge when last reference on the page gone. At that point if we see PageTransHuge() it means we need to unchange whole huge page. The tricky part is partial unmap -- when we try to unmap part of huge page. We don't do a special handing of this situation, meaning we don't uncharge the part of huge page unless last user is gone or split_huge_page() is triggered. In case of cgroup memory pressure happens the partial unmapped page will be split through shrinker. This should be good enough. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2292468f2a30..189f04d4d2ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp); + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); + bool lrucare, bool compound); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root, static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcgp) + struct mem_cgroup **memcgp, + bool compound) { *memcgp = NULL; return 0; @@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { } static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg, + bool compound) { } -- cgit v1.2.3 From 1f25fe20a76af0d960172fb104d4b13697cafa84 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:24 -0800 Subject: mm, thp: adjust conditions when we can reuse the page on WP fault With new refcounting we will be able map the same compound page with PTEs and PMDs. It requires adjustment to conditions when we can reuse the page on write-protection fault. For PTE fault we can't reuse the page if it's part of huge page. For PMD we can only reuse the page if nobody else maps the huge page or it's part. We can do it by checking page_mapcount() on each sub-page, but it's expensive. The cheaper way is to check page_count() to be equal 1: every mapcount takes page reference, so this way we can guarantee, that the PMD is the only mapping. This approach can give false negative if somebody pinned the page, but that doesn't affect correctness. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 066bd21765ad..a282933c5bc6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -538,7 +538,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) (page_mapcount(page) == 1) +#define reuse_swap_page(page) \ + (!PageTransCompound(page) && page_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { -- cgit v1.2.3 From 78ddc53473419073ffb2e91178001e87bc513524 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:42 -0800 Subject: thp: rename split_huge_page_pmd() to split_huge_pmd() We are going to decouple splitting THP PMD from splitting underlying compound page. This patch renames split_huge_page_pmd*() functions to split_huge_pmd*() to reflect the fact that it doesn't imply page splitting, only PMD. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ecb080d6ff42..805c7ae42280 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -102,7 +102,7 @@ static inline int split_huge_page(struct page *page) } extern void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd); -#define split_huge_page_pmd(__vma, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (unlikely(pmd_trans_huge(*____pmd))) \ @@ -117,8 +117,6 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ } while (0) -extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd); #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -183,11 +181,9 @@ static inline int split_huge_page(struct page *page) { return 0; } -#define split_huge_page_pmd(__vma, __address, __pmd) \ - do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -#define split_huge_page_pmd_mm(__mm, __address, __pmd) \ +#define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) -- cgit v1.2.3 From 122afea9626ab3f717b250a8dd3d5ebf57cdb56c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:46 -0800 Subject: mm, vmstats: new THP splitting event The patch replaces THP_SPLIT with tree events: THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED and THP_SPLIT_PMD. It reflects the fact that we are going to be able split PMD without the compound page and that split_huge_page() can fail. Signed-off-by: Kirill A. Shutemov Acked-by: Christoph Lameter Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e623d392db0c..e1f8c993e73b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -68,7 +68,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_FAULT_FALLBACK, THP_COLLAPSE_ALLOC, THP_COLLAPSE_ALLOC_FAILED, - THP_SPLIT, + THP_SPLIT_PAGE, + THP_SPLIT_PAGE_FAILED, + THP_SPLIT_PMD, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif -- cgit v1.2.3 From ad0bed24e98bcae9952c2d1f663ec7cb6344a387 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:53 -0800 Subject: thp: drop all split_huge_page()-related code We will re-introduce new version with new refcounting later in patchset. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 805c7ae42280..9df5802faadf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -95,28 +95,12 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #endif /* CONFIG_DEBUG_VM */ extern unsigned long transparent_hugepage_flags; -extern int split_huge_page_to_list(struct page *page, struct list_head *list); -static inline int split_huge_page(struct page *page) -{ - return split_huge_page_to_list(page, NULL); -} -extern void __split_huge_page_pmd(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd); -#define split_huge_pmd(__vma, __pmd, __address) \ - do { \ - pmd_t *____pmd = (__pmd); \ - if (unlikely(pmd_trans_huge(*____pmd))) \ - __split_huge_page_pmd(__vma, __address, \ - ____pmd); \ - } while (0) -#define wait_split_huge_page(__anon_vma, __pmd) \ - do { \ - pmd_t *____pmd = (__pmd); \ - anon_vma_lock_write(__anon_vma); \ - anon_vma_unlock_write(__anon_vma); \ - BUG_ON(pmd_trans_splitting(*____pmd) || \ - pmd_trans_huge(*____pmd)); \ - } while (0) + +#define split_huge_page_to_list(page, list) BUILD_BUG() +#define split_huge_page(page) BUILD_BUG() +#define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() + +#define wait_split_huge_page(__anon_vma, __pmd) BUILD_BUG() #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -- cgit v1.2.3 From ddc58f27f9eee9117219936f77e90ad5b2e00e96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:56 -0800 Subject: mm: drop tail page refcounting Tail page refcounting is utterly complicated and painful to support. It uses ->_mapcount on tail pages to store how many times this page is pinned. get_page() bumps ->_mapcount on tail page in addition to ->_count on head. This information is required by split_huge_page() to be able to distribute pins from head of compound page to tails during the split. We will need ->_mapcount to account PTE mappings of subpages of the compound page. We eliminate need in current meaning of ->_mapcount in tail pages by forbidding split entirely if the page is pinned. The only user of tail page refcounting is THP which is marked BROKEN for now. Let's drop all this mess. It makes get_page() and put_page() much simpler. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 47 ++++++++++------------------------------------- include/linux/mm_types.h | 17 +++-------------- 2 files changed, 13 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 839d9e9a1c38..34387351930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -466,44 +466,9 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline bool __compound_tail_refcounted(struct page *page) -{ - return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); -} - -/* - * This takes a head page as parameter and tells if the - * tail page reference counting can be skipped. - * - * For this to be safe, PageSlab and PageHeadHuge must remain true on - * any given page where they return true here, until all tail pins - * have been released. - */ -static inline bool compound_tail_refcounted(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return __compound_tail_refcounted(page); -} - -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run from under us. - */ - VM_BUG_ON_PAGE(!PageTail(page), page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(compound_head(page))) - atomic_inc(&page->_mapcount); -} - -extern bool __get_page_tail(struct page *page); - static inline void get_page(struct page *page) { - if (unlikely(PageTail(page))) - if (likely(__get_page_tail(page))) - return; + page = compound_head(page); /* * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. @@ -528,7 +493,15 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -void put_page(struct page *page); +void __put_page(struct page *page); + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + if (put_page_testzero(page)) + __put_page(page); +} + void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6bc9a0ce2253..faf6fe88d6b3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -81,20 +81,9 @@ struct page { union { /* - * Count of ptes mapped in - * mms, to show when page is - * mapped & limit reverse map - * searches. - * - * Used also for tail pages - * refcounting instead of - * _count. Tail pages cannot - * be mapped and keeping the - * tail page _count zero at - * all times guarantees - * get_page_unless_zero() will - * never succeed on tail - * pages. + * Count of ptes mapped in mms, to show + * when page is mapped & limit reverse + * map searches. */ atomic_t _mapcount; -- cgit v1.2.3 From 3ac808fdd2b835547af81de75c813cf7ba2ab58f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:07 -0800 Subject: mm, thp: remove compound_lock() We are going to use migration entries to stabilize page counts. It means we don't need compound_lock() for that. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 35 ----------------------------------- include/linux/page-flags.h | 12 +----------- 2 files changed, 1 insertion(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 34387351930c..70f59de2e288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -410,41 +410,6 @@ static inline int is_vmalloc_or_module_addr(const void *x) extern void kvfree(const void *addr); -static inline void compound_lock(struct page *page) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(PageSlab(page), page); - bit_spin_lock(PG_compound_lock, &page->flags); -#endif -} - -static inline void compound_unlock(struct page *page) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(PageSlab(page), page); - bit_spin_unlock(PG_compound_lock, &page->flags); -#endif -} - -static inline unsigned long compound_lock_irqsave(struct page *page) -{ - unsigned long uninitialized_var(flags); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - local_irq_save(flags); - compound_lock(page); -#endif - return flags; -} - -static inline void compound_unlock_irqrestore(struct page *page, - unsigned long flags) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - compound_unlock(page); - local_irq_restore(flags); -#endif -} - /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7bc7fd9c4c5c..0c42acca0338 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -101,9 +101,6 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - PG_compound_lock, -#endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) PG_young, PG_idle, @@ -613,12 +610,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) #define __PG_MLOCKED 0 #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define __PG_COMPOUND_LOCK (1 << PG_compound_lock) -#else -#define __PG_COMPOUND_LOCK 0 -#endif - /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. @@ -628,8 +619,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - 1 << PG_unevictable | __PG_MLOCKED | \ - __PG_COMPOUND_LOCK) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. -- cgit v1.2.3 From 4b471e8898c3d0f5c97a3c73ac32d0549fe01c87 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:39 -0800 Subject: mm, thp: remove infrastructure for handling splitting PMDs With new refcounting we don't need to mark PMDs splitting. Let's drop code to handle this. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9df5802faadf..333b058b1e3d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,7 +25,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern int move_huge_pmd(struct vm_area_struct *vma, +extern bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, @@ -48,15 +48,9 @@ enum transparent_hugepage_flag { #endif }; -enum page_check_address_pmd_flag { - PAGE_CHECK_ADDRESS_PMD_FLAG, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, -}; extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag, spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) @@ -100,7 +94,6 @@ extern unsigned long transparent_hugepage_flags; #define split_huge_page(page) BUILD_BUG() #define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() -#define wait_split_huge_page(__anon_vma, __pmd) BUILD_BUG() #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -110,17 +103,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pmd_trans_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma, ptl); else - return 0; + return false; } static inline int hpage_nr_pages(struct page *page) { @@ -165,8 +158,6 @@ static inline int split_huge_page(struct page *page) { return 0; } -#define wait_split_huge_page(__anon_vma, __pmd) \ - do { } while (0) #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, @@ -181,10 +172,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, +static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { - return 0; + return false; } static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 53f9263baba69fc1630e3c780c4d11b72643f962 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:42 -0800 Subject: mm: rework mapcount accounting to enable 4k mapping of THPs We're going to allow mapping of individual 4k pages of THP compound. It means we need to track mapcount on per small page basis. Straight-forward approach is to use ->_mapcount in all subpages to track how many time this subpage is mapped with PMDs or PTEs combined. But this is rather expensive: mapping or unmapping of a THP page with PMD would require HPAGE_PMD_NR atomic operations instead of single we have now. The idea is to store separately how many times the page was mapped as whole -- compound_mapcount. This frees up ->_mapcount in subpages to track PTE mapcount. We use the same approach as with compound page destructor and compound order to store compound_mapcount: use space in first tail page, ->mapping this time. Any time we map/unmap whole compound page (THP or hugetlb) -- we increment/decrement compound_mapcount. When we map part of compound page with PTE we operate on ->_mapcount of the subpage. page_mapcount() counts both: PTE and PMD mappings of the page. Basically, we have mapcount for a subpage spread over two counters. It makes tricky to detect when last mapcount for a page goes away. We introduced PageDoubleMap() for this. When we split THP PMD for the first time and there's other PMD mapping left we offset up ->_mapcount in all subpages by one and set PG_double_map on the compound page. These additional references go away with last compound_mapcount. This approach provides a way to detect when last mapcount goes away on per small page basis without introducing new overhead for most common cases. [akpm@linux-foundation.org: fix typo in comment] [mhocko@suse.com: ignore partial THP when moving task] Signed-off-by: Kirill A. Shutemov Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Sasha Levin Cc: Aneesh Kumar K.V Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 26 ++++++++++++++++++++++++-- include/linux/mm_types.h | 1 + include/linux/page-flags.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/rmap.h | 4 ++-- 4 files changed, 63 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 70f59de2e288..67e0fab225e8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -410,6 +410,19 @@ static inline int is_vmalloc_or_module_addr(const void *x) extern void kvfree(const void *addr); +static inline atomic_t *compound_mapcount_ptr(struct page *page) +{ + return &page[1].compound_mapcount; +} + +static inline int compound_mapcount(struct page *page) +{ + if (!PageCompound(page)) + return 0; + page = compound_head(page); + return atomic_read(compound_mapcount_ptr(page)) + 1; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -422,8 +435,17 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { + int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - return atomic_read(&page->_mapcount) + 1; + + ret = atomic_read(&page->_mapcount) + 1; + if (PageCompound(page)) { + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + } + return ret; } static inline int page_count(struct page *page) @@ -934,7 +956,7 @@ static inline pgoff_t page_file_index(struct page *page) */ static inline int page_mapped(struct page *page) { - return atomic_read(&(page)->_mapcount) >= 0; + return atomic_read(&(page)->_mapcount) + compound_mapcount(page) >= 0; } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf6fe88d6b3..809defe0597d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -54,6 +54,7 @@ struct page { * see PAGE_MAPPING_ANON below. */ void *s_mem; /* slab first object */ + atomic_t compound_mapcount; /* first tail page */ }; /* Second double word */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0c42acca0338..19724e6ebd26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -126,6 +126,9 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, + + /* Compound pages. Stored in first tail page's flags */ + PG_double_map = PG_private_2, }; #ifndef __GENERATING_BOUNDS_H @@ -523,10 +526,43 @@ static inline int PageTransTail(struct page *page) return PageTail(page); } +/* + * PageDoubleMap indicates that the compound page is mapped with PTEs as well + * as PMDs. + * + * This is required for optimization of rmap operations for THP: we can postpone + * per small page mapcount accounting (and its overhead from atomic operations) + * until the first PMD split. + * + * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up + * by one. This reference will go away with last compound_mapcount. + * + * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). + */ +static inline int PageDoubleMap(struct page *page) +{ + return PageHead(page) && test_bit(PG_double_map, &page[1].flags); +} + +static inline int TestSetPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_set_bit(PG_double_map, &page[1].flags); +} + +static inline int TestClearPageDoubleMap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + return test_and_clear_bit(PG_double_map, &page[1].flags); +} + #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransTail) +TESTPAGEFLAG_FALSE(DoubleMap) + TESTSETFLAG_FALSE(DoubleMap) + TESTCLEARFLAG_FALSE(DoubleMap) #endif /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 038b6e704d9b..ebf3750e42b2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,9 +183,9 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); -static inline void page_dup_rmap(struct page *page) +static inline void page_dup_rmap(struct page *page, bool compound) { - atomic_inc(&page->_mapcount); + atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } /* -- cgit v1.2.3 From e1534ae95004d6a307839a44eed40389d608c935 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:46 -0800 Subject: mm: differentiate page_mapped() from page_mapcount() for compound pages Let's define page_mapped() to be true for compound pages if any sub-pages of the compound page is mapped (with PMD or PTE). On other hand page_mapcount() return mapcount for this particular small page. This will make cases like page_get_anon_vma() behave correctly once we allow huge pages to be mapped with PTE. Most users outside core-mm should use page_mapcount() instead of page_mapped(). Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 67e0fab225e8..6b56cfd9fd09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -953,10 +953,21 @@ static inline pgoff_t page_file_index(struct page *page) /* * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. */ -static inline int page_mapped(struct page *page) +static inline bool page_mapped(struct page *page) { - return atomic_read(&(page)->_mapcount) + compound_mapcount(page) >= 0; + int i; + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; } /* -- cgit v1.2.3 From eef1b3ba053aa68967d294c80a50c4a26db30f52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:53 -0800 Subject: thp: implement split_huge_pmd() Original split_huge_page() combined two operations: splitting PMDs into tables of PTEs and splitting underlying compound page. This patch implements split_huge_pmd() which split given PMD without splitting other PMDs this page mapped with or underlying compound page. Without tail page refcounting, implementation of split_huge_pmd() is pretty straight-forward. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 333b058b1e3d..f1fa1c283be1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,7 +92,16 @@ extern unsigned long transparent_hugepage_flags; #define split_huge_page_to_list(page, list) BUILD_BUG() #define split_huge_page(page) BUILD_BUG() -#define split_huge_pmd(__vma, __pmd, __address) BUILD_BUG() + +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address); + +#define split_huge_pmd(__vma, __pmd, __address) \ + do { \ + pmd_t *____pmd = (__pmd); \ + if (pmd_trans_huge(*____pmd)) \ + __split_huge_pmd(__vma, __pmd, __address); \ + } while (0) #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" -- cgit v1.2.3 From 4e41a30c6d506c884d3da9aeb316352e70679d4b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 15 Jan 2016 16:54:07 -0800 Subject: mm: hwpoison: adjust for new thp refcounting Some mm-related BUG_ON()s could trigger from hwpoison code due to recent changes in thp refcounting rule. This patch fixes them up. In the new refcounting, we no longer use tail->_mapcount to keep tail's refcount, and thereby we can simplify get/put_hwpoison_page(). And another change is that tail's refcount is not transferred to the raw page during thp split (more precisely, in new rule we don't take refcount on tail page any more.) So when we need thp split, we have to transfer the refcount properly to the 4kB soft-offlined page before migration. thp split code goes into core code only when precheck (total_mapcount(head) == page_count(head) - 1) passes to avoid useless split, where we assume that one refcount is held by the caller of thp split and the others are taken via mapping. To meet this assumption, this patch moves thp split part in soft_offline_page() after get_any_page(). [akpm@linux-foundation.org: remove unneeded #define, per Kirill] Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b56cfd9fd09..e4397f640e86 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags); extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); extern int unpoison_memory(unsigned long pfn); extern int get_hwpoison_page(struct page *page); -extern void put_hwpoison_page(struct page *page); +#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From e9b61f19858a5d6c42ce2298cf138279375d0d9b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:10 -0800 Subject: thp: reintroduce split_huge_page() This patch adds implementation of split_huge_page() for new refcountings. Unlike previous implementation, new split_huge_page() can fail if somebody holds GUP pin on the page. It also means that pin on page would prevent it from bening split under you. It makes situation in many places much cleaner. The basic scheme of split_huge_page(): - Check that sum of mapcounts of all subpage is equal to page_count() plus one (caller pin). Foll off with -EBUSY. This way we can avoid useless PMD-splits. - Freeze the page counters by splitting all PMD and setup migration PTEs. - Re-check sum of mapcounts against page_count(). Page's counts are stable now. -EBUSY if page is pinned. - Split compound page. - Unfreeze the page by removing migration entries. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++-- include/linux/pagemap.h | 13 ++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f1fa1c283be1..90e11e6a37ab 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -90,8 +90,11 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); extern unsigned long transparent_hugepage_flags; -#define split_huge_page_to_list(page, list) BUILD_BUG() -#define split_huge_page(page) BUILD_BUG() +int split_huge_page_to_list(struct page *page, struct list_head *list); +static inline int split_huge_page(struct page *page) +{ + return split_huge_page_to_list(page, NULL); +} void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index df214a4b886d..4d08b6c33557 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_pgoff(struct page *page) { + pgoff_t pgoff; + if (unlikely(PageHeadHuge(page))) return page->index << compound_order(page); - else + + if (likely(!PageTransTail(page))) return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + /* + * We don't initialize ->index for tail pages: calculate based on + * head page + */ + pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff += page - compound_head(page); + return pgoff; } /* -- cgit v1.2.3 From 9a982250f773cc8c76f1eee68a770b7cbf2faf78 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:17 -0800 Subject: thp: introduce deferred_split_huge_page() Currently we don't split huge page on partial unmap. It's not an ideal situation. It can lead to memory overhead. Furtunately, we can detect partial unmap on page_remove_rmap(). But we cannot call split_huge_page() from there due to locking context. It's also counterproductive to do directly from munmap() codepath: in many cases we will hit this from exit(2) and splitting the huge page just to free it up in small pages is not what we really want. The patch introduce deferred_split_huge_page() which put the huge page into queue for splitting. The splitting itself will happen when we get memory pressure via shrinker interface. The page will be dropped from list on freeing through compound page destructor. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++++ include/linux/mm.h | 5 +++++ include/linux/mm_types.h | 2 ++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 90e11e6a37ab..7aec5ee9cfdf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -90,11 +90,15 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); extern unsigned long transparent_hugepage_flags; +extern void prep_transhuge_page(struct page *page); +extern void free_transhuge_page(struct page *page); + int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } +void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address); @@ -170,6 +174,7 @@ static inline int split_huge_page(struct page *page) { return 0; } +static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, diff --git a/include/linux/mm.h b/include/linux/mm.h index e4397f640e86..aa8ae8330a75 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -507,6 +507,9 @@ enum compound_dtor_id { COMPOUND_PAGE_DTOR, #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + TRANSHUGE_PAGE_DTOR, #endif NR_COMPOUND_DTORS, }; @@ -537,6 +540,8 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +void free_compound_page(struct page *page); + #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 809defe0597d..2dd9c313a8c0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -55,6 +55,7 @@ struct page { */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ + /* page_deferred_list().next -- second tail page */ }; /* Second double word */ @@ -62,6 +63,7 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* sl[aou]b first free object */ + /* page_deferred_list().prev -- second tail page */ }; union { -- cgit v1.2.3 From b20ce5e03b936be077463015661dcf52be274e5b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:54:37 -0800 Subject: mm: prepare page_referenced() and page_idle to new THP refcounting Both page_referenced() and page_idle_clear_pte_refs_one() assume that THP can only be mapped with PMD, so there's no reason to look on PTEs for PageTransHuge() pages. That's no true anymore: THP can be mapped with PTEs too. The patch removes PageTransHuge() test from the functions and opencode page table check. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. Shutemov Cc: Vladimir Davydov Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Sasha Levin Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 ----- include/linux/mm.h | 23 ++++++++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7aec5ee9cfdf..72cd942edb22 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -48,11 +48,6 @@ enum transparent_hugepage_flag { #endif }; -extern pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - spinlock_t **ptl); - #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<_mapcount, -1); } +int __page_mapcount(struct page *page); + static inline int page_mapcount(struct page *page) { - int ret; VM_BUG_ON_PAGE(PageSlab(page), page); - ret = atomic_read(&page->_mapcount) + 1; - if (PageCompound(page)) { - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - } - return ret; + if (unlikely(PageCompound(page))) + return __page_mapcount(page); + return atomic_read(&page->_mapcount) + 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int total_mapcount(struct page *page); +#else +static inline int total_mapcount(struct page *page) +{ + return page_mapcount(page); } +#endif static inline int page_count(struct page *page) { -- cgit v1.2.3 From 8749cfea11f3fffe8f7cad891470a77b36e0185f Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 15 Jan 2016 16:54:45 -0800 Subject: mm: add page_check_address_transhuge() helper page_referenced_one() and page_idle_clear_pte_refs_one() duplicate the code for looking up pte of a (possibly transhuge) page. Move this code to a new helper function, page_check_address_transhuge(), and make the above mentioned functions use it. This is just a cleanup, no functional changes are intended. Signed-off-by: Vladimir Davydov Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index ebf3750e42b2..77d1ba57d495 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -215,6 +215,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, return ptep; } +/* + * Used by idle page tracking to check if a page was referenced via page + * tables. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, + unsigned long address, pmd_t **pmdp, + pte_t **ptep, spinlock_t **ptlp); +#else +static inline bool page_check_address_transhuge(struct page *page, + struct mm_struct *mm, unsigned long address, + pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) +{ + *ptep = page_check_address(page, mm, address, ptlp, 0); + *pmdp = NULL; + return !!*ptep; +} +#endif + /* * Used by swapoff to help locate where page is expected in vma. */ -- cgit v1.2.3 From 854e9ed09dedf0c19ac8640e91bcc74bc3f9e5c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:54:53 -0800 Subject: mm: support madvise(MADV_FREE) Linux doesn't have an ability to free pages lazy while other OS already have been supported that named by madvise(MADV_FREE). The gain is clear that kernel can discard freed pages rather than swapping out or OOM if memory pressure happens. Without memory pressure, freed pages would be reused by userspace without another additional overhead(ex, page fault + allocation + zeroing). Jason Evans said: : Facebook has been using MAP_UNINITIALIZED : (https://lkml.org/lkml/2012/1/18/308) in some of its applications for : several years, but there are operational costs to maintaining this : out-of-tree in our kernel and in jemalloc, and we are anxious to retire it : in favor of MADV_FREE. When we first enabled MAP_UNINITIALIZED it : increased throughput for much of our workload by ~5%, and although the : benefit has decreased using newer hardware and kernels, there is still : enough benefit that we cannot reasonably retire it without a replacement. : : Aside from Facebook operations, there are numerous broadly used : applications that would benefit from MADV_FREE. The ones that immediately : come to mind are redis, varnish, and MariaDB. I don't have much insight : into Android internals and development process, but I would hope to see : MADV_FREE support eventually end up there as well to benefit applications : linked with the integrated jemalloc. : : jemalloc will use MADV_FREE once it becomes available in the Linux kernel. : In fact, jemalloc already uses MADV_FREE or equivalent everywhere it's : available: *BSD, OS X, Windows, and Solaris -- every platform except Linux : (and AIX, but I'm not sure it even compiles on AIX). The lack of : MADV_FREE on Linux forced me down a long series of increasingly : sophisticated heuristics for madvise() volume reduction, and even so this : remains a common performance issue for people using jemalloc on Linux. : Please integrate MADV_FREE; many people will benefit substantially. How it works: When madvise syscall is called, VM clears dirty bit of ptes of the range. If memory pressure happens, VM checks dirty bit of page table and if it found still "clean", it means it's a "lazyfree pages" so VM could discard the page instead of swapping out. Once there was store operation for the page before VM peek a page to reclaim, dirty bit is set so VM can swap out the page instead of discarding. One thing we should notice is that basically, MADV_FREE relies on dirty bit in page table entry to decide whether VM allows to discard the page or not. IOW, if page table entry includes marked dirty bit, VM shouldn't discard the page. However, as a example, if swap-in by read fault happens, page table entry doesn't have dirty bit so MADV_FREE could discard the page wrongly. For avoiding the problem, MADV_FREE did more checks with PageDirty and PageSwapCache. It worked out because swapped-in page lives on swap cache and since it is evicted from the swap cache, the page has PG_dirty flag. So both page flags check effectively prevent wrong discarding by MADV_FREE. However, a problem in above logic is that swapped-in page has PG_dirty still after they are removed from swap cache so VM cannot consider the page as freeable any more even if madvise_free is called in future. Look at below example for detail. ptr = malloc(); memset(ptr); .. .. .. heavy memory pressure so all of pages are swapped out .. .. var = *ptr; -> a page swapped-in and could be removed from swapcache. Then, page table doesn't mark dirty bit and page descriptor includes PG_dirty .. .. madvise_free(ptr); -> It doesn't clear PG_dirty of the page. .. .. .. .. heavy memory pressure again. .. In this time, VM cannot discard the page because the page .. has *PG_dirty* To solve the problem, this patch clears PG_dirty if only the page is owned exclusively by current process when madvise is called because PG_dirty represents ptes's dirtiness in several processes so we could clear it only if we own it exclusively. Firstly, heavy users would be general allocators(ex, jemalloc, tcmalloc and hope glibc supports it) and jemalloc/tcmalloc already have supported the feature for other OS(ex, FreeBSD) barrios@blaptop:~/benchmark/ebizzy$ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 12 On-line CPU(s) list: 0-11 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 12 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 2 Stepping: 3 CPU MHz: 3200.185 BogoMIPS: 6400.53 Virtualization: VT-x Hypervisor vendor: KVM Virtualization type: full L1d cache: 32K L1i cache: 32K L2 cache: 4096K NUMA node0 CPU(s): 0-11 ebizzy benchmark(./ebizzy -S 10 -n 512) Higher avg is better. vanilla-jemalloc MADV_free-jemalloc 1 thread records: 10 records: 10 avg: 2961.90 avg: 12069.70 std: 71.96(2.43%) std: 186.68(1.55%) max: 3070.00 max: 12385.00 min: 2796.00 min: 11746.00 2 thread records: 10 records: 10 avg: 5020.00 avg: 17827.00 std: 264.87(5.28%) std: 358.52(2.01%) max: 5244.00 max: 18760.00 min: 4251.00 min: 17382.00 4 thread records: 10 records: 10 avg: 8988.80 avg: 27930.80 std: 1175.33(13.08%) std: 3317.33(11.88%) max: 9508.00 max: 30879.00 min: 5477.00 min: 21024.00 8 thread records: 10 records: 10 avg: 13036.50 avg: 33739.40 std: 170.67(1.31%) std: 5146.22(15.25%) max: 13371.00 max: 40572.00 min: 12785.00 min: 24088.00 16 thread records: 10 records: 10 avg: 11092.40 avg: 31424.20 std: 710.60(6.41%) std: 3763.89(11.98%) max: 12446.00 max: 36635.00 min: 9949.00 min: 25669.00 32 thread records: 10 records: 10 avg: 11067.00 avg: 34495.80 std: 971.06(8.77%) std: 2721.36(7.89%) max: 12010.00 max: 38598.00 min: 9002.00 min: 30636.00 In summary, MADV_FREE is about much faster than MADV_DONTNEED. This patch (of 12): Add core MADV_FREE implementation. [akpm@linux-foundation.org: small cleanups] Signed-off-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hugh Dickins Cc: Mika Penttil Cc: Michael Kerrisk Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Jason Evans Cc: Daniel Micay Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andy Lutomirski Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: "Shaohua Li" Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Max Filippov Cc: Ralf Baechle Cc: Richard Henderson Cc: Roland Dreier Cc: Russell King Cc: Shaohua Li Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ include/linux/vm_event_item.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 77d1ba57d495..bdf597c4f0be 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -85,6 +85,7 @@ enum ttu_flags { TTU_UNMAP = 1, /* unmap mode */ TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ + TTU_LZFREE = 8, /* lazy free mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -311,5 +312,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 +#define SWAP_LZFREE 4 #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e1f8c993e73b..67c1dbd19c6d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), PGFREE, PGACTIVATE, PGDEACTIVATE, PGFAULT, PGMAJFAULT, + PGLAZYFREED, FOR_ALL_ZONES(PGREFILL), FOR_ALL_ZONES(PGSTEAL_KSWAPD), FOR_ALL_ZONES(PGSTEAL_DIRECT), -- cgit v1.2.3 From 10853a039208c4afaa322a7d802456c8dca222f4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:11 -0800 Subject: mm: move lazily freed pages to inactive list MADV_FREE is a hint that it's okay to discard pages if there is memory pressure and we use reclaimers(ie, kswapd and direct reclaim) to free them so there is no value keeping them in the active anonymous LRU so this patch moves them to inactive LRU list's head. This means that MADV_FREE-ed pages which were living on the inactive list are reclaimed first because they are more likely to be cold rather than recently active pages. An arguable issue for the approach would be whether we should put the page to the head or tail of the inactive list. I chose head because the kernel cannot make sure it's really cold or warm for every MADV_FREE usecase but at least we know it's not *hot*, so landing of inactive head would be a comprimise for various usecases. This fixes suboptimal behavior of MADV_FREE when pages living on the active list will sit there for a long time even under memory pressure while the inactive list is reclaimed heavily. This basically breaks the whole purpose of using MADV_FREE to help the system to free memory which is might not be used. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Shaohua Li Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Jason Evans Cc: KOSAKI Motohiro Cc: Kirill A. Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Michael Kerrisk Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Roland Dreier Cc: Russell King Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a282933c5bc6..414e101cd061 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); -- cgit v1.2.3 From b8d3c4c3009d42869dc03a1da0efc2aa687d0ab4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:42 -0800 Subject: mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called We don't need to split THP page when MADV_FREE syscall is called if [start, len] is aligned with THP size. The split could be done when VM decide to free it in reclaim path if memory pressure is heavy. With that, we could avoid unnecessary THP split. For the feature, this patch changes pte dirtness marking logic of THP. Now, it marks every ptes of pages dirty unconditionally in splitting, which makes MADV_FREE void. So, instead, this patch propagates pmd dirtiness to all pages via PG_dirty and restores pte dirtiness from PG_dirty. With this, if pmd is clean(ie, MADV_FREEed) when split happens(e,g, shrink_page_list), all of pages are clean too so we could discard them. Signed-off-by: Minchan Kim Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Russell King Cc: Shaohua Li Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 72cd942edb22..0160201993d4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +extern int madvise_free_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next); extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); -- cgit v1.2.3 From b2e0d1625e193b40cbbd45b799f82d54d34e015c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:55:59 -0800 Subject: dax: fix lifetime of in-kernel dax mappings with dax_map_atomic() The DAX implementation needs to protect new calls to ->direct_access() and usage of its return value against the driver for the underlying block device being disabled. Use blk_queue_enter()/blk_queue_exit() to hold off blk_cleanup_queue() from proceeding, or otherwise fail new mapping requests if the request_queue is being torn down. This also introduces blk_dax_ctl to simplify the interface from fs/dax.c through dax_map_atomic() to bdev_direct_access(). [willy@linux.intel.com: fix read() of a hole] Signed-off-by: Dan Williams Reviewed-by: Jeff Moyer Cc: Jan Kara Cc: Jens Axboe Cc: Dave Chinner Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..88821fa26f19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1617,6 +1617,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req, #endif /* CONFIG_BLK_DEV_INTEGRITY */ +/** + * struct blk_dax_ctl - control and output parameters for ->direct_access + * @sector: (input) offset relative to a block_device + * @addr: (output) kernel virtual address for @sector populated by driver + * @pfn: (output) page frame number for @addr populated by driver + * @size: (input) number of bytes requested + */ +struct blk_dax_ctl { + sector_t sector; + void __pmem *addr; + long size; + unsigned long pfn; +}; + struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); @@ -1643,8 +1657,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern long bdev_direct_access(struct block_device *, sector_t, - void __pmem **addr, unsigned long *pfn, long size); +extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From ba049e93aef7e8c571567088b1b73f4f5b99272a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:11 -0800 Subject: kvm: rename pfn_t to kvm_pfn_t To date, we have implemented two I/O usage models for persistent memory, PMEM (a persistent "ram disk") and DAX (mmap persistent memory into userspace). This series adds a third, DAX-GUP, that allows DAX mappings to be the target of direct-i/o. It allows userspace to coordinate DMA/RDMA from/to persistent memory. The implementation leverages the ZONE_DEVICE mm-zone that went into 4.3-rc1 (also discussed at kernel summit) to flag pages that are owned and dynamically mapped by a device driver. The pmem driver, after mapping a persistent memory range into the system memmap via devm_memremap_pages(), arranges for DAX to distinguish pfn-only versus page-backed pmem-pfns via flags in the new pfn_t type. The DAX code, upon seeing a PFN_DEV+PFN_MAP flagged pfn, flags the resulting pte(s) inserted into the process page tables with a new _PAGE_DEVMAP flag. Later, when get_user_pages() is walking ptes it keys off _PAGE_DEVMAP to pin the device hosting the page range active. Finally, get_page() and put_page() are modified to take references against the device driver established page mapping. Finally, this need for "struct page" for persistent memory requires memory capacity to store the memmap array. Given the memmap array for a large pool of persistent may exhaust available DRAM introduce a mechanism to allocate the memmap from persistent memory. The new "struct vmem_altmap *" parameter to devm_memremap_pages() enables arch_add_memory() to use reserved pmem capacity rather than the page allocator. This patch (of 18): The core has developed a need for a "pfn_t" type [1]. Move the existing pfn_t in KVM to kvm_pfn_t [2]. [1]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002199.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2015-September/002218.html Signed-off-by: Dan Williams Acked-by: Christoffer Dall Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kvm_host.h | 37 +++++++++++++++++++------------------ include/linux/kvm_types.h | 2 +- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f707f74055c3..861f690aa791 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -66,7 +66,7 @@ * error pfns indicate that the gfn is in slot but faild to * translate it to pfn on host. */ -static inline bool is_error_pfn(pfn_t pfn) +static inline bool is_error_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_MASK); } @@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn) * translated to pfn - it is not in slot or failed to * translate it to pfn. */ -static inline bool is_error_noslot_pfn(pfn_t pfn) +static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) { return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); } /* noslot pfn indicates that the gfn is not in slot. */ -static inline bool is_noslot_pfn(pfn_t pfn) +static inline bool is_noslot_pfn(kvm_pfn_t pfn) { return pfn == KVM_PFN_NOSLOT; } @@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, +kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); +kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); +kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); -pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); -pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, - bool *async, bool write_fault, bool *writable); +kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); +kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, + bool atomic, bool *async, bool write_fault, + bool *writable); -void kvm_release_pfn_clean(pfn_t pfn); -void kvm_set_pfn_dirty(pfn_t pfn); -void kvm_set_pfn_accessed(pfn_t pfn); -void kvm_get_pfn(pfn_t pfn); +void kvm_release_pfn_clean(kvm_pfn_t pfn); +void kvm_set_pfn_dirty(kvm_pfn_t pfn); +void kvm_set_pfn_accessed(kvm_pfn_t pfn); +void kvm_get_pfn(kvm_pfn_t pfn); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); @@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); -pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); +kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); @@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_is_reserved_pfn(pfn_t pfn); +bool kvm_is_reserved_pfn(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; @@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa) return (gfn_t)(gpa >> PAGE_SHIFT); } -static inline hpa_t pfn_to_hpa(pfn_t pfn) +static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; } diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 1b47a185c2f0..8bf259dae9f6 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,7 +53,7 @@ typedef unsigned long hva_t; typedef u64 hpa_t; typedef u64 hfn_t; -typedef hfn_t pfn_t; +typedef hfn_t kvm_pfn_t; struct gfn_to_hva_cache { u64 generation; -- cgit v1.2.3 From 34c0fd540e79fb49ef9ce864dae1058cca265780 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:14 -0800 Subject: mm, dax, pmem: introduce pfn_t For the purpose of communicating the optional presence of a 'struct page' for the pfn returned from ->direct_access(), introduce a type that encapsulates a page-frame-number plus flags. These flags contain the historical "page_link" encoding for a scatterlist entry, but can also denote "device memory". Where "device memory" is a set of pfns that are not part of the kernel's linear mapping by default, but are accessed via the same memory controller as ram. The motivation for this new type is large capacity persistent memory that needs struct page entries in the 'memmap' to support 3rd party DMA (i.e. O_DIRECT I/O with a persistent memory source/target). However, we also need it in support of maintaining a list of mapped inodes which need to be unmapped at driver teardown or freeze_bdev() time. Signed-off-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Hansen Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 5 ++-- include/linux/pfn.h | 9 +++++++ include/linux/pfn_t.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 include/linux/pfn_t.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88821fa26f19..bfb64d672e19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1628,7 +1629,7 @@ struct blk_dax_ctl { sector_t sector; void __pmem *addr; long size; - unsigned long pfn; + pfn_t pfn; }; struct block_device_operations { @@ -1638,7 +1639,7 @@ struct block_device_operations { int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void __pmem **, - unsigned long *pfn); + pfn_t *); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 97f3e88aead4..2d8e49711b63 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -3,6 +3,15 @@ #ifndef __ASSEMBLY__ #include + +/* + * pfn_t: encapsulates a page-frame number that is optionally backed + * by memmap (struct page). Whether a pfn_t has a 'struct page' + * backing is indicated by flags in the high bits of the value. + */ +typedef struct { + unsigned long val; +} pfn_t; #endif #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h new file mode 100644 index 000000000000..c557a0e0b20c --- /dev/null +++ b/include/linux/pfn_t.h @@ -0,0 +1,67 @@ +#ifndef _LINUX_PFN_T_H_ +#define _LINUX_PFN_T_H_ +#include + +/* + * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags + * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry + * PFN_SG_LAST - pfn references a page and is the last scatterlist entry + * PFN_DEV - pfn is not covered by system memmap by default + * PFN_MAP - pfn has a dynamic page mapping established by a device driver + */ +#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ + << (BITS_PER_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) +#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) +#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) +#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +{ + pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; + + return pfn_t; +} + +/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */ +static inline pfn_t pfn_to_pfn_t(unsigned long pfn) +{ + return __pfn_to_pfn_t(pfn, 0); +} + +extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); + +static inline bool pfn_t_has_page(pfn_t pfn) +{ + return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0; +} + +static inline unsigned long pfn_t_to_pfn(pfn_t pfn) +{ + return pfn.val & ~PFN_FLAGS_MASK; +} + +static inline struct page *pfn_t_to_page(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return pfn_to_page(pfn_t_to_pfn(pfn)); + return NULL; +} + +static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +{ + return PFN_PHYS(pfn_t_to_pfn(pfn)); +} + +static inline void *pfn_t_to_virt(pfn_t pfn) +{ + if (pfn_t_has_page(pfn)) + return __va(pfn_t_to_phys(pfn)); + return NULL; +} + +static inline pfn_t page_to_pfn_t(struct page *page) +{ + return pfn_to_pfn_t(page_to_pfn(page)); +} +#endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From 260ae3f7db614a5c4aa4b773599f99adc1d9859e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:17 -0800 Subject: mm: skip memory block registration for ZONE_DEVICE Prevent userspace from trying and failing to online ZONE_DEVICE pages which are meant to never be onlined. For example on platforms with a udev rule like the following: SUBSYSTEM=="memory", ACTION=="add", ATTR{state}=="offline", ATTR{state}="online" ...will generate futile attempts to online the ZONE_DEVICE sections. Example kernel messages: Built 1 zonelists in Node order, mobility grouping on. Total pages: 1004747 Policy zone: Normal online_pages [mem 0x248000000-0x24fffffff] failed Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ef5f21735af..d9fe12d45c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -674,6 +674,18 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_zone_device_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_DEVICE; +} +#else +static inline bool is_zone_device_page(const struct page *page) +{ + return false; +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif -- cgit v1.2.3 From 9476df7d80dfc425b37bfecf1d89edf8ec81fcb6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:19 -0800 Subject: mm: introduce find_dev_pagemap() There are several scenarios where we need to retrieve and update metadata associated with a given devm_memremap_pages() mapping, and the only lookup key available is a pfn in the range: 1/ We want to augment vmemmap_populate() (called via arch_add_memory()) to allocate memmap storage from pre-allocated pages reserved by the device driver. At vmemmap_alloc_block_buf() time it grabs device pages rather than page allocator pages. This is in support of devm_memremap_pages() mappings where the memmap is too large to fit in main memory (i.e. large persistent memory devices). 2/ Taking a reference against the mapping when inserting device pages into the address_space radix of a given inode. This facilitates unmap_mapping_range() and truncate_inode_pages() operations when the driver is tearing down the mapping. 3/ get_user_pages() operations on ZONE_DEVICE memory require taking a reference against the mapping so that the driver teardown path can revoke and drain usage of device pages. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 15 --------------- include/linux/memremap.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 15 deletions(-) create mode 100644 include/linux/memremap.h (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..fffd88d7f426 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr); void *__devm_memremap_pages(struct device *dev, struct resource *res); -#ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); -#else -static inline void *devm_memremap_pages(struct device *dev, struct resource *res) -{ - /* - * Fail attempts to call devm_memremap_pages() without - * ZONE_DEVICE support enabled, this requires callers to fall - * back to plain devm_memremap() based on config - */ - WARN_ON_ONCE(1); - return ERR_PTR(-ENXIO); -} -#endif - /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. diff --git a/include/linux/memremap.h b/include/linux/memremap.h new file mode 100644 index 000000000000..d90721c178bb --- /dev/null +++ b/include/linux/memremap.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_MEMREMAP_H_ +#define _LINUX_MEMREMAP_H_ +#include + +struct resource; +struct device; +/** + * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @dev: host device of the mapping for debug + */ +struct dev_pagemap { + /* TODO: vmem_altmap and percpu_ref count */ + struct device *dev; +}; + +#ifdef CONFIG_ZONE_DEVICE +void *devm_memremap_pages(struct device *dev, struct resource *res); +struct dev_pagemap *find_dev_pagemap(resource_size_t phys); +#else +static inline void *devm_memremap_pages(struct device *dev, + struct resource *res) +{ + /* + * Fail attempts to call devm_memremap_pages() without + * ZONE_DEVICE support enabled, this requires callers to fall + * back to plain devm_memremap() based on config + */ + WARN_ON_ONCE(1); + return ERR_PTR(-ENXIO); +} + +static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + return NULL; +} +#endif + +#endif /* _LINUX_MEMREMAP_H_ */ -- cgit v1.2.3 From 4b94ffdc4163bae1ec73b6e977ffb7a7da3d06d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:22 -0800 Subject: x86, mm: introduce vmem_altmap to augment vmemmap_populate() In support of providing struct page for large persistent memory capacities, use struct vmem_altmap to change the default policy for allocating memory for the memmap array. The default vmemmap_populate() allocates page table storage area from the page allocator. Given persistent memory capacities relative to DRAM it may not be feasible to store the memmap in 'System Memory'. Instead vmem_altmap represents pre-allocated "device pages" to satisfy vmemmap_alloc_block_buf() requests. Signed-off-by: Dan Williams Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- include/linux/memremap.h | 39 +++++++++++++++++++++++++++++++++++---- include/linux/mm.h | 9 ++++++++- 3 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..43405992d027 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); +extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d90721c178bb..aa3e82a80d7b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,21 +4,53 @@ struct resource; struct device; + +/** + * struct vmem_altmap - pre-allocated storage for vmemmap_populate + * @base_pfn: base of the entire dev_pagemap mapping + * @reserve: pages mapped, but reserved for driver use (relative to @base) + * @free: free pages set aside in the mapping for memmap storage + * @align: pages reserved to meet allocation alignments + * @alloc: track pages consumed, private to vmemmap_populate() + */ +struct vmem_altmap { + const unsigned long base_pfn; + const unsigned long reserve; + unsigned long free; + unsigned long align; + unsigned long alloc; +}; + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); +#else +static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + return NULL; +} +#endif + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @altmap: pre-allocated/reserved memory for vmemmap allocations * @dev: host device of the mapping for debug */ struct dev_pagemap { - /* TODO: vmem_altmap and percpu_ref count */ + struct vmem_altmap *altmap; + const struct resource *res; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res) + struct resource *res, struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -34,5 +66,4 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif - #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d9fe12d45c21..8bb0907a3603 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); -void *vmemmap_alloc_block_buf(unsigned long size, int node); +struct vmem_altmap; +void *__vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); +static inline void *vmemmap_alloc_block_buf(unsigned long size, int node) +{ + return __vmemmap_alloc_block_buf(size, node, NULL); +} + void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node); -- cgit v1.2.3 From 888cdbc2c9a76a0e450f533b1957cdbfe7d483d5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:32 -0800 Subject: hugetlb: fix compile error on tile Inlude asm/pgtable.h to get the definition for pud_t to fix: include/linux/hugetlb.h:203:29: error: unknown type name 'pud_t' Signed-off-by: Dan Williams Cc: Liviu Dudau Cc: Sudeep Holla Cc: Lorenzo Pieralisi Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e76574d8f9b5..7d953c2542a8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -8,6 +8,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; -- cgit v1.2.3 From 01c8f1c44b83a0825b573e7c723b033cece37b86 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:40 -0800 Subject: mm, dax, gpu: convert vm_insert_mixed to pfn_t Convert the raw unsigned long 'pfn' argument to pfn_t for the purpose of evaluating the PFN_MAP and PFN_DEV flags. When both are set it triggers _PAGE_DEVMAP to be set in the resulting pte. There are no functional changes to the gpu drivers as a result of this conversion. Signed-off-by: Dan Williams Cc: Dave Hansen Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/pfn_t.h | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bb0907a3603..a9902152449f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2107,7 +2107,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn); + pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index c557a0e0b20c..bdaa275d7623 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -64,4 +64,31 @@ static inline pfn_t page_to_pfn_t(struct page *page) { return pfn_to_pfn_t(page_to_pfn(page)); } + +static inline int pfn_t_valid(pfn_t pfn) +{ + return pfn_valid(pfn_t_to_pfn(pfn)); +} + +#ifdef CONFIG_MMU +static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pte(pfn_t_to_pfn(pfn), pgprot); +} +#endif + +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline bool pfn_t_devmap(pfn_t pfn) +{ + const unsigned long flags = PFN_DEV|PFN_MAP; + + return (pfn.val & flags) == flags; +} +#else +static inline bool pfn_t_devmap(pfn_t pfn) +{ + return false; +} +pte_t pte_mkdevmap(pte_t pte); +#endif #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From f25748e3c34eb8bb54853e9adba2d3dcf030503c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:43 -0800 Subject: mm, dax: convert vmf_insert_pfn_pmd() to pfn_t Similar to the conversion of vm_insert_mixed() use pfn_t in the vmf_insert_pfn_pmd() to tag the resulting pte with _PAGE_DEVICE when the pfn is backed by a devm_memremap_pages() mapping. Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Matthew Wilcox Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- include/linux/pfn_t.h | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0160201993d4..8ca35a131904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -37,7 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned long pfn, bool write); + pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index bdaa275d7623..0703b5360d31 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -77,6 +77,13 @@ static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) } #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); +} +#endif + #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { @@ -90,5 +97,6 @@ static inline bool pfn_t_devmap(pfn_t pfn) return false; } pte_t pte_mkdevmap(pte_t pte); +pmd_t pmd_mkdevmap(pmd_t pmd); #endif #endif /* _LINUX_PFN_T_H_ */ -- cgit v1.2.3 From 5c2c2587b13235bf8b5c9027589f22eff68bdf49 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:49 -0800 Subject: mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup get_dev_page() enables paths like get_user_pages() to pin a dynamically mapped pfn-range (devm_memremap_pages()) while the resulting struct page objects are in use. Unlike get_page() it may fail if the device is, or is in the process of being, disabled. While the initial lookup of the range may be an expensive list walk, the result is cached to speed up subsequent lookups which are likely to be in the same mapped range. devm_memremap_pages() now requires a reference counter to be specified at init time. For pmem this means moving request_queue allocation into pmem_alloc() so the existing queue usage counter can track "device pages". ZONE_DEVICE pages always have an elevated count and will never be on an lru reclaim list. That space in 'struct page' can be redirected for other uses, but for safety introduce a poison value that will always trip __list_add() to assert. This allows half of the struct list_head storage to be reclaimed with some assurance to back up the assumption that the page count never goes to zero and a list_add() is never attempted. Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 +++++++++++ include/linux/memremap.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/mm_types.h | 5 +++++ 3 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 5356f4d661a7..30cf4200ab40 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif +#ifdef CONFIG_DEBUG_LIST +/* + * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one + * of the pages it allocates is ever passed to list_add() + */ +extern void list_force_poison(struct list_head *entry); +#else +/* fallback to the less strict LIST_POISON* definitions */ +#define list_force_poison list_del +#endif + /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa3e82a80d7b..bcaa634139a9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -1,6 +1,8 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include +#include +#include struct resource; struct device; @@ -36,21 +38,25 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations + * @res: physical address range covered by @ref + * @ref: reference count that pins the devm_memremap_pages() mapping * @dev: host device of the mapping for debug */ struct dev_pagemap { struct vmem_altmap *altmap; const struct resource *res; + struct percpu_ref *ref; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct resource *res, - struct vmem_altmap *altmap); + struct percpu_ref *ref, struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res, struct vmem_altmap *altmap) + struct resource *res, struct percpu_ref *ref, + struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -66,4 +72,43 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif + +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the + * same mapping. + */ +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) +{ + const struct resource *res = pgmap ? pgmap->res : NULL; + resource_size_t phys = PFN_PHYS(pfn); + + /* + * In the cached case we're already holding a live reference so + * we can simply do a blind increment + */ + if (res && phys >= res->start && phys <= res->end) { + percpu_ref_get(pgmap->ref); + return pgmap; + } + + /* fall back to slow path lookup */ + rcu_read_lock(); + pgmap = find_dev_pagemap(phys); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; + rcu_read_unlock(); + + return pgmap; +} + +static inline void put_dev_pagemap(struct dev_pagemap *pgmap) +{ + if (pgmap) + percpu_ref_put(pgmap->ref); +} #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2dd9c313a8c0..d3ebb9d21a53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -116,6 +116,11 @@ struct page { * Can be used as a generic list * by the page owner. */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ struct { /* slub per cpu partial pages */ struct page *next; /* Next partial slab */ #ifdef CONFIG_64BIT -- cgit v1.2.3 From 5c7fb56e5e3f7035dd798a8e1adee639f87043e5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:52 -0800 Subject: mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd A dax-huge-page mapping while it uses some thp helpers is ultimately not a transparent huge page. The distinction is especially important in the get_user_pages() path. pmd_devmap() is used to distinguish dax-pmds from pmd_huge() and pmd_trans_huge() which have slightly different semantics. Explicitly mark the pmd_trans_huge() helpers that dax needs by adding pmd_devmap() checks. [kirill.shutemov@linux.intel.com: fix regression in handling mlocked pages in __split_huge_pmd()] Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Matthew Wilcox Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++-- include/linux/mm.h | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8ca35a131904..d39fa60bd6bf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,7 +104,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd)) \ + if (pmd_trans_huge(*____pmd) \ + || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address); \ } while (0) @@ -124,7 +125,7 @@ static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma, ptl); else return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index a9902152449f..cd123272d28d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,6 +329,13 @@ struct inode; #define page_private(page) ((page)->private) #define set_page_private(page, v) ((page)->private = (v)) +#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +static inline int pmd_devmap(pmd_t pmd) +{ + return 0; +} +#endif + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit v1.2.3 From 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:55 -0800 Subject: mm, x86: get_user_pages() for dax mappings A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we lookup and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page(). Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++++++++- include/linux/mm.h | 59 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d39fa60bd6bf..cfe81e10bd54 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, pfn_t pfn, bool write); - enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -55,6 +54,9 @@ enum transparent_hugepage_flag { #define HPAGE_PMD_NR (1< #include #include +#include #include #include #include @@ -465,17 +466,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline void get_page(struct page *page) -{ - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page) void __put_page(struct page *page); -static inline void put_page(struct page *page) -{ - page = compound_head(page); - if (put_page_testzero(page)) - __put_page(page); -} - void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); @@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE +void get_zone_device_page(struct page *page); +void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else +static inline void get_zone_device_page(struct page *page) +{ +} +static inline void put_zone_device_page(struct page *page) +{ +} static inline bool is_zone_device_page(const struct page *page) { return false; } #endif +static inline void get_page(struct page *page) +{ + page = compound_head(page); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); + atomic_inc(&page->_count); + + if (unlikely(is_zone_device_page(page))) + get_zone_device_page(page); +} + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + + if (put_page_testzero(page)) + __put_page(page); + + if (unlikely(is_zone_device_page(page))) + put_zone_device_page(page); +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif +#ifndef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return 0; +} +#endif + int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3 From 4a9e1cda274893eca7d178d7dc265503ccb9d87a Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 15 Jan 2016 16:57:04 -0800 Subject: mm: bring in additional flag for fixup_user_fault to signal unlock During Jason's work with postcopy migration support for s390 a problem regarding gmap faults was discovered. The gmap code will call fixup_user_fault which will end up always in handle_mm_fault. Till now we never cared about retries, but as the userfaultfd code kind of relies on it. this needs some fix. This patchset does not take care of the futex code. I will now look closer at this. This patch (of 2): With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we ever unlocked mmap_sem. This patch brings in the logic to handle retries as well as it cleans up the current documentation. fixup_user_fault was not having the same semantics as filemap_fault. It never indicated if a retry happened and so a caller wasn't able to handle that case. So we now changed the behaviour to always retry a locked mmap_sem. Signed-off-by: Dominik Dingel Reviewed-by: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Christian Borntraeger Cc: "Jason J. Herne" Cc: David Rientjes Cc: Eric B Munson Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Heiko Carstens Cc: Dominik Dingel Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 792f2469c142..1d6ec55d8b25 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,7 +1194,8 @@ int invalidate_inode_page(struct page *page); extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags); + unsigned long address, unsigned int fault_flags, + bool *unlocked); #else static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1206,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm, } static inline int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, - unsigned int fault_flags) + unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); -- cgit v1.2.3 From 036fbb21de7c74d5637bf41110c47005363f3000 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:57:11 -0800 Subject: memblock: fix section mismatch allmodconfig produces following warning for me: WARNING: vmlinux.o(.text.unlikely+0x10314): Section mismatch in reference from the function movable_node_is_enabled() to the variable .meminit.data:movable_node_enabled The function movable_node_is_enabled() references the variable __meminitdata movable_node_enabled. This is often because movable_node_is_enabled lacks a __meminitdata annotation or the annotation of movable_node_enabled is wrong. Let's mark the function with __meminit. It fixes the warning. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 173fb44e22f1..3106ac1c895e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,14 @@ extern int memblock_debug; extern bool movable_node_enabled; #endif /* CONFIG_MOVABLE_NODE */ +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#define __init_memblock __meminit +#define __initdata_memblock __meminitdata +#else +#define __init_memblock +#define __initdata_memblock +#endif + #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) @@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m) return m->flags & MEMBLOCK_HOTPLUG; } -static inline bool movable_node_is_enabled(void) +static inline bool __init_memblock movable_node_is_enabled(void) { return movable_node_enabled; } @@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo for (idx = 0; idx < memblock_type->cnt; \ idx++,rgn = &memblock_type->regions[idx]) -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -#define __init_memblock __meminit -#define __initdata_memblock __meminitdata -#else -#define __init_memblock -#define __initdata_memblock -#endif - #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); #else -- cgit v1.2.3 From 7f43add451d2a0d235074b72d254ae266a6a023f Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Fri, 15 Jan 2016 16:57:22 -0800 Subject: mm/mlock.c: change can_do_mlock return value type to boolean Since can_do_mlock only return 1 or 0, so make it boolean. No functional change. [akpm@linux-foundation.org: update declaration in mm.h] Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d6ec55d8b25..f1cd22f2df1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1100,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping) } #endif -extern int can_do_mlock(void); +extern bool can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); -- cgit v1.2.3 From b8a0255db958b8f70c5267dda152e93b6fda1778 Mon Sep 17 00:00:00 2001 From: Vasily Kulikov Date: Fri, 15 Jan 2016 16:57:55 -0800 Subject: include/linux/poison.h: use POISON_POINTER_DELTA for poison pointers TIMER_ENTRY_STATIC and TAIL_MAPPING are defined as poison pointers which should point to nowhere. Redefine them using POISON_POINTER_DELTA arithmetics to make sure they really point to non-mappable area declared by the target architecture. Signed-off-by: Vasily Kulikov Acked-by: Thomas Gleixner Cc: Solar Designer Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 76c3b6c38c16..4a27153574e2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -27,14 +27,14 @@ * Magic number "tsta" to indicate a static timer initializer * for the object debugging code. */ -#define TIMER_ENTRY_STATIC ((void *) 0x74737461) +#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/debug-pagealloc.c **********/ #define PAGE_POISON 0xaa /********** mm/page_alloc.c ************/ -#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA) +#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA) /********** mm/slab.c **********/ /* -- cgit v1.2.3 From 8f57e4d930d48217268315898212518d4d3e0773 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Fri, 15 Jan 2016 16:57:58 -0800 Subject: include/linux/kernel.h: change abs() macro so it uses consistent return type Rewrite abs() so that its return type does not depend on the architecture and no unexpected type conversion happen inside of it. The only conversion is from unsigned to signed type. char is left as a return type but treated as a signed type regradless of it's actual signedness. With the old version, int arguments were promoted to long and depending on architecture a long argument might result in s64 or long return type (which may or may not be the same). This came after some back and forth with Nicolas. The current macro has different return type (for the same input type) depending on architecture which might be midly iritating. An alternative version would promote to int like so: #define abs(x) __abs_choose_expr(x, long long, \ __abs_choose_expr(x, long, \ __builtin_choose_expr( \ sizeof(x) <= sizeof(int), \ ({ int __x = (x); __x<0?-__x:__x; }), \ ((void)0)))) I have no preference but imagine Linus might. :] Nicolas argument against is that promoting to int causes iconsistent behaviour: int main(void) { unsigned short a = 0, b = 1, c = a - b; unsigned short d = abs(a - b); unsigned short e = abs(c); printf("%u %u\n", d, e); // prints: 1 65535 } Then again, no sane person expects consistent behaviour from C integer arithmetic. ;) Note: __builtin_types_compatible_p(unsigned char, char) is always false, and __builtin_types_compatible_p(signed char, char) is also always false. Signed-off-by: Michal Nazarewicz Reviewed-by: Nicolas Pitre Cc: Srinivas Pandruvada Cc: Wey-Yi Guy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7311c3294e25..f31638c6e873 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -202,26 +202,26 @@ extern int _cond_resched(void); /** * abs - return absolute value of an argument - * @x: the value. If it is unsigned type, it is converted to signed type first - * (s64, long or int depending on its size). + * @x: the value. If it is unsigned type, it is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. * - * Return: an absolute value of x. If x is 64-bit, macro's return type is s64, - * otherwise it is signed long. + * Return: an absolute value of x. */ -#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \ - s64 __x = (x); \ - (__x < 0) ? -__x : __x; \ - }), ({ \ - long ret; \ - if (sizeof(x) == sizeof(long)) { \ - long __x = (x); \ - ret = (__x < 0) ? -__x : __x; \ - } else { \ - int __x = (x); \ - ret = (__x < 0) ? -__x : __x; \ - } \ - ret; \ - })) +#define abs(x) __abs_choose_expr(x, long long, \ + __abs_choose_expr(x, long, \ + __abs_choose_expr(x, int, \ + __abs_choose_expr(x, short, \ + __abs_choose_expr(x, char, \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), char), \ + (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ + ((void)0))))))) + +#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), signed type) || \ + __builtin_types_compatible_p(typeof(x), unsigned type), \ + ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other) /** * reciprocal_scale - "scale" a value into range [0, ep_ro) -- cgit v1.2.3 From 3f1bfd94136ecb85889e6e22893c08e8a9c697c2 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Fri, 15 Jan 2016 16:58:04 -0800 Subject: include/linux/kdev_t.h: remove new_valid_dev() As all new_valid_dev() checks have been removed it's time to drop new_valid_dev() itself. No functional change. Signed-off-by: Yaowei Bai Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdev_t.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 052c7b32cc91..8e9e288b08c1 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val) return MKDEV((val >> 8) & 255, val & 255); } -static inline bool new_valid_dev(dev_t dev) -{ - return 1; -} - static inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); -- cgit v1.2.3 From dfffa587a6bcd84f2087f88e11600b0e8b0aa1ee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 15 Jan 2016 16:58:10 -0800 Subject: err.h: add (missing) unlikely() to IS_ERR_OR_NULL() IS_ERR_VALUE() already contains it and so we need to add this only to the !ptr check. That will allow users of IS_ERR_OR_NULL(), to not add this compiler flag. Signed-off-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/err.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index a729120644d5..56762ab41713 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr) static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) { - return !ptr || IS_ERR_VALUE((unsigned long)ptr); + return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); } /** -- cgit v1.2.3 From 8d91f8b15361dfb438ab6eb3b319e2ded43458ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 15 Jan 2016 16:58:24 -0800 Subject: printk: do cond_resched() between lines while outputting to consoles @console_may_schedule tracks whether console_sem was acquired through lock or trylock. If the former, we're inside a sleepable context and console_conditional_schedule() performs cond_resched(). This allows console drivers which use console_lock for synchronization to yield while performing time-consuming operations such as scrolling. However, the actual console outputting is performed while holding irq-safe logbuf_lock, so console_unlock() clears @console_may_schedule before starting outputting lines. Also, only a few drivers call console_conditional_schedule() to begin with. This means that when a lot of lines need to be output by console_unlock(), for example on a console registration, the task doing console_unlock() may not yield for a long time on a non-preemptible kernel. If this happens with a slow console devices, for example a serial console, the outputting task may occupy the cpu for a very long time. Long enough to trigger softlockup and/or RCU stall warnings, which in turn pile more messages, sometimes enough to trigger the next cycle of warnings incapacitating the system. Fix it by making console_unlock() insert cond_resched() between lines if @console_may_schedule. Signed-off-by: Tejun Heo Reported-by: Calvin Owens Acked-by: Jan Kara Cc: Dave Jones Cc: Kyle McMartin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index bd194343c346..ea731af2451e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -150,6 +150,7 @@ extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); +extern void console_flush_on_panic(void); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); -- cgit v1.2.3 From fe22cd9b7c980b8b948ec85f034a8668c57ec867 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 15 Jan 2016 16:59:12 -0800 Subject: printk: help pr_debug and pr_devel to optimize out arguments Currently, pr_debug and pr_devel will not elide function call arguments appearing in calls to the no_printk for these macros. This is because all side effects must be honored before proceeding to the 0-value assignment in no_printk. The behavior is contrary to documentation found in the CodingStyle and the header file where these functions are declared. This patch corrects that behavior by shunting out the call to no_printk completely. The format string is still checked by gcc for correctness, but no code seems to be emitted in common cases. [akpm@linux-foundation.org: remove braces, per Joe] Fixes: 5264f2f75d86 ("include/linux/printk.h: use and neaten no_printk") Signed-off-by: Aaron Conole Reported-by: Dmitry Vyukov Cc: Joe Perches Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9729565c25ff..9ccbdf2c1453 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -106,13 +106,13 @@ struct va_format { /* * Dummy printk for disabled debugging statements to use whilst maintaining - * gcc's format and side-effect checking. + * gcc's format checking. */ -static inline __printf(1, 2) -int no_printk(const char *fmt, ...) -{ - return 0; -} +#define no_printk(fmt, ...) \ +do { \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ +} while (0) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) -- cgit v1.2.3 From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..2ead22dd74a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -87,7 +87,8 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @start_pid: timer statistics field to store the pid of the task which + * @is_rel: Set if the timer was armed relative + * @start_pid: timer statistics field to store the pid of the task which * started the timer * @start_site: timer statistics field to store the site where the timer * was started @@ -101,7 +102,8 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; - unsigned long state; + u8 state; + u8 is_rel; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { } #endif +static inline ktime_t +__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) +{ + ktime_t rem = ktime_sub(timer->node.expires, now); + + /* + * Adjust relative timers for the extra we added in + * hrtimer_start_range_ns() to prevent short timeouts. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) + rem.tv64 -= hrtimer_resolution; + return rem; +} + +static inline ktime_t +hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) +{ + return __hrtimer_expires_remaining_adjusted(timer, + timer->base->get_time()); +} + extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); @@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer) } /* Query timers: */ -extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); +extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); + +static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + return __hrtimer_get_remaining(timer, false); +} extern u64 hrtimer_get_next_event(void); -- cgit v1.2.3 From 0b6e26ce89391327d955a756a7823272238eb867 Mon Sep 17 00:00:00 2001 From: Doron Tsur Date: Sun, 17 Jan 2016 11:25:47 +0200 Subject: net/mlx5_core: Fix trimming down IRQ number With several ConnectX-4 cards installed on a server, one may receive irqn > 255 from the kernel API, which we mistakenly trim to 8bit. This causes EQ creation failure with the following stack trace: [] dump_stack+0x48/0x64 [] __setup_irq+0x3a1/0x4f0 [] request_threaded_irq+0x120/0x180 [] ? mlx5_eq_int+0x450/0x450 [mlx5_core] [] mlx5_create_map_eq+0x1e4/0x2b0 [mlx5_core] [] alloc_comp_eqs+0xb1/0x180 [mlx5_core] [] mlx5_dev_init+0x5e9/0x6e0 [mlx5_core] [] init_one+0x99/0x1c0 [mlx5_core] [] local_pci_probe+0x4c/0xa0 Fixing it by changing of the irqn type from u8 to unsigned int to support values > 255 Fixes: 61d0e73e0a5a ('net/mlx5_core: Use the the real irqn in eq->irqn') Reported-by: Jiri Pirko Signed-off-by: Doron Tsur Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/cq.h | 2 +- include/linux/mlx5/driver.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index abc4767695e4..b2c9fada8eac 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -45,7 +45,7 @@ struct mlx5_core_cq { atomic_t refcount; struct completion free; unsigned vector; - int irqn; + unsigned int irqn; void (*comp) (struct mlx5_core_cq *); void (*event) (struct mlx5_core_cq *, enum mlx5_event); struct mlx5_uar *uar; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2fd7019f69db..5162f3533042 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -303,7 +303,7 @@ struct mlx5_eq { u32 cons_index; struct mlx5_buf buf; int size; - u8 irqn; + unsigned int irqn; u8 eqn; int nent; u64 mask; @@ -783,7 +783,8 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_start_eqs(struct mlx5_core_dev *dev); int mlx5_stop_eqs(struct mlx5_core_dev *dev); -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, int *irqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit v1.2.3 From 00d27c6336b00345724b2510f7c5b8cee3055f02 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Tue, 5 Jan 2016 11:22:10 -0500 Subject: numa: remove stale node_has_online_mem() define This isn't used anywhere, so delete it. Looks like the last usage (in x86-specific code) was removed by Tejun in 2011 in commit bd6709a91a59 ("x86, NUMA: Make 32bit use common NUMA init path"). Signed-off-by: Chris Metcalf --- include/linux/topology.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 73ddad1e0fa3..afce69296ac0 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -34,10 +34,6 @@ #include #include -#ifndef node_has_online_mem -#define node_has_online_mem(nid) (1) -#endif - #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif -- cgit v1.2.3 From f25bf1977f7a968e85fe8ab99252b8132c6cf8c4 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:48:07 +0200 Subject: net/mlx4: Remove unused macro The macro mlx4_foreach_non_ib_transport_port() is not used anywhere. Remove it. Fixes: aa9a2d51a3e7 ("mlx4: Activate RoCE/SRIOV") Signed-off-by: Moni Shoua Signed-off-by: Doug Ledford --- include/linux/mlx4/device.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d3133be12d92..971037188907 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -979,10 +979,6 @@ struct mlx4_mad_ifc { for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if ((type) == (dev)->caps.port_mask[(port)]) -#define mlx4_foreach_non_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) - #define mlx4_foreach_ib_transport_port(port, dev) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ -- cgit v1.2.3 From cc886c9ff1607eda04062bdcec963e2f8e6a3eb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:12 -0500 Subject: svcrdma: Improve allocation of struct svc_rdma_op_ctxt When the maximum payload size of NFS READ and WRITE was increased by commit cc9a903d915c ("svcrdma: Change maximum server payload back to RPCSVC_MAXPAYLOAD"), the size of struct svc_rdma_op_ctxt increased to over 6KB (on x86_64). That makes allocating one of these from a kmem_cache more likely to fail in situations when system memory is exhausted. Since I'm about to add a caller where this allocation must always work _and_ it cannot sleep, pre-allocate ctxts for each connection. Another motivation for this change is that NFSv4.x servers are required by specification not to drop NFS requests. Pre-allocating memory resources reduces the likelihood of a drop. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f869807a0d0e..be2804b72cd8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -69,6 +69,7 @@ extern atomic_t rdma_stat_sq_prod; * completes. */ struct svc_rdma_op_ctxt { + struct list_head free; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -141,7 +142,10 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; atomic_t sc_dma_used; - atomic_t sc_ctxt_used; + spinlock_t sc_ctxt_lock; + struct list_head sc_ctxts; + int sc_ctxt_used; + struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; -- cgit v1.2.3 From 2fe81b239dbb00d0a2fd8858ac9dd4ef4a8841ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:20 -0500 Subject: svcrdma: Improve allocation of struct svc_rdma_req_map To ensure this allocation cannot fail and will not sleep, pre-allocate the req_map structures per-connection. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index be2804b72cd8..05bf4febad44 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -113,6 +113,7 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { + struct list_head free; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; @@ -145,6 +146,8 @@ struct svcxprt_rdma { spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; + spinlock_t sc_map_lock; + struct list_head sc_maps; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -223,8 +226,9 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); -extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); +extern void svc_rdma_put_req_map(struct svcxprt_rdma *, + struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); -- cgit v1.2.3 From 71810ef3271d1a06f7002c55c7e354d8c3233762 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:28 -0500 Subject: svcrdma: Remove unused req_map and ctxt kmem_caches Clean up. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 05bf4febad44..141edbbb73b3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -242,6 +242,7 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); -- cgit v1.2.3 From 39b09a1a121cb22820c374f4e92f7ca34be1b75d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:37 -0500 Subject: svcrdma: Add gfp flags to svc_rdma_post_recv() svc_rdma_post_recv() allocates pages for receive buffers on-demand. It uses GFP_KERNEL so the allocator tries hard, and may sleep. But I'm about to add a call to svc_rdma_post_recv() from a function that may not sleep. Since all svc_rdma_post_recv() call sites can tolerate its failure, allow it to fail if the page allocator returns nothing. Longer term, receive buffers, being a finite resource per-connection, should be pre-allocated and re-used. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 141edbbb73b3..729ff356c18a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -221,7 +221,7 @@ extern struct rpcrdma_read_chunk * extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); -- cgit v1.2.3 From ba986c96f907a513215fb7f1c0a89261c97251ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:49:53 -0500 Subject: svcrdma: Make map_xdr non-static Pre-requisite to use map_xdr in the backchannel code. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 729ff356c18a..aeffa30655ce 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -213,6 +213,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, u32, u32, u64, bool); /* svc_rdma_sendto.c */ +extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, + struct svc_rdma_req_map *); extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); -- cgit v1.2.3 From 03fe9931536fe4782e9e34f7f499d588acd2015b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:50:02 -0500 Subject: svcrdma: Define maximum number of backchannel requests Extra resources for handling backchannel requests have to be pre-allocated when a transport instance is created. Set up additional fields in svcxprt_rdma to track these resources. The max_requests fields are elements of the RPC-over-RDMA protocol, so they should be u32. To ensure that unsigned arithmetic is used everywhere, some other fields in the svcxprt_rdma struct are updated. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index aeffa30655ce..9a2c418dc690 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -51,6 +51,7 @@ /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; extern atomic_t rdma_stat_recv; @@ -134,10 +135,11 @@ struct svcxprt_rdma { int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ - int sc_sq_depth; /* Depth of SQ */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ - - int sc_max_requests; /* Depth of RQ */ + unsigned int sc_sq_depth; /* Depth of SQ */ + unsigned int sc_rq_depth; /* Depth of RQ */ + u32 sc_max_requests; /* Forward credits */ + u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ struct ib_pd *sc_pd; @@ -186,6 +188,11 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 +/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our + * current NFSv4.1 implementation supports one backchannel slot. + */ +#define RPCRDMA_MAX_BC_REQUESTS 2 + #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD /* svc_rdma_marshal.c */ -- cgit v1.2.3 From 5d252f90a800cee5bc57c76d636ae60464f7a887 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Jan 2016 14:50:10 -0500 Subject: svcrdma: Add class for RDMA backwards direction transport To support the server-side of an NFSv4.1 backchannel on RDMA connections, add a transport class that enables backward direction messages on an existing forward channel connection. Signed-off-by: Chuck Lever Acked-by: Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 9a2c418dc690..b13513a0caf4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -195,6 +195,11 @@ struct svcxprt_rdma { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* svc_rdma_backchannel.c */ +extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, + struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf); + /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, -- cgit v1.2.3 From 5fe1043da84887369d32459514f2c7d98ff37936 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jan 2016 23:53:41 -0800 Subject: svc_rdma: use local_dma_lkey We now alwasy have a per-PD local_dma_lkey available. Make use of that fact in svc_rdma and stop registering our own MR. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Jason Gunthorpe Reviewed-by: Chuck Lever Reviewed-by: Steve Wise Acked-by: J. Bruce Fields Signed-off-by: Doug Ledford --- include/linux/sunrpc/svc_rdma.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b13513a0caf4..5322fea6fe4c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -156,13 +156,11 @@ struct svcxprt_rdma { struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; - struct ib_mr *sc_phys_mr; /* MR for server memory */ int (*sc_reader)(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ - u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; struct list_head sc_frmr_q; spinlock_t sc_frmr_q_lock; -- cgit v1.2.3 From d8ae914196d35bbc0c459aec6de588ba585a1c3e Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:32 +0200 Subject: net/mlx4: Query RoCE support Query the RoCE support from firmware using the appropriate firmware commands. Downstream patches will read these capabilities and act accordingly. Signed-off-by: Moni Shoua Signed-off-by: Doug Ledford --- include/linux/mlx4/device.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 971037188907..28cbee0df7d7 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -216,6 +216,7 @@ enum { MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, + MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, }; enum { @@ -267,12 +268,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19, MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; enum { - MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP, + MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2 }; enum mlx4_event { @@ -980,9 +983,10 @@ struct mlx4_mad_ifc { if ((type) == (dev)->caps.port_mask[(port)]) #define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ + ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) #define MLX4_INVALID_SLAVE_ID 0xFF #define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) -- cgit v1.2.3 From 7e57b85c444c3c1bf3550aa6890666fc4353bd33 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:35 +0200 Subject: IB/mlx4: Add support for setting RoCEv2 gids in hardware To tell hardware about a gid with type RoCEv2, software needs a new modifier to the SET_PORT command: MLX4_SET_PORT_ROCE_ADDR. This can replace the old method, MLX4_SET_PORT_GID_TABLE, for RoCEv1 gids. Signed-off-by: Moni Shoua Signed-off-by: Doug Ledford --- include/linux/mlx4/cmd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 58391f2e0414..116b284bc4ce 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -206,7 +206,8 @@ enum { MLX4_SET_PORT_GID_TABLE = 0x5, MLX4_SET_PORT_PRIO2TC = 0x8, MLX4_SET_PORT_SCHEDULER = 0x9, - MLX4_SET_PORT_VXLAN = 0xB + MLX4_SET_PORT_VXLAN = 0xB, + MLX4_SET_PORT_ROCE_ADDR = 0xD }; enum { -- cgit v1.2.3 From fca83006294a6356705781eee31da1658fd411a5 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:36 +0200 Subject: net/mlx4_core: Add support for configuring RoCE v2 UDP port In order to support RoCE v2, the hardware needs to be configured to classify certain UDP packets as RoCE v2 packets and pass it through its RoCE pipeline. This patch enables configuring this UDP port. Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 28cbee0df7d7..430a929f048b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1457,6 +1457,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port); int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); -- cgit v1.2.3 From 3f723f42d9d625bb9ecfe923d19d1d42da775797 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:37 +0200 Subject: net/mlx4_core: Add support for RoCE v2 entropy In RoCE v2 we need to choose a source UDP port, we do so by using entropy over the source and dest QPNs. Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx4/qp.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index fe052e234906..cdf110d3f260 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -204,7 +204,8 @@ struct mlx4_qp_context { u32 reserved1; __be32 next_send_psn; __be32 cqn_send; - u32 reserved2[2]; + __be16 roce_entropy; + __be16 reserved2[3]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -487,4 +488,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); +static inline u16 folded_qp(u32 q) +{ + u16 res; + + res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00); + return res; +} + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + #endif /* MLX4_QP_H */ -- cgit v1.2.3 From 3b5daf28ac4bb9354b7d2f10ce5942cad23e979a Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 14 Jan 2016 17:50:39 +0200 Subject: IB/mlx4: Support modify_qp for RoCE v2 In order to support modify_qp for RoCE v2, we need to set the gid_type in the QP context. Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx4/qp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index cdf110d3f260..587cdf943b52 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -194,7 +194,7 @@ struct mlx4_qp_context { u8 mtu_msgmax; u8 rq_size_stride; u8 sq_size_stride; - u8 rlkey; + u8 rlkey_roce_mode; __be32 usr_page; __be32 local_qpn; __be32 remote_qpn; -- cgit v1.2.3 From 759c01142a5d0f364a462346168a56de28a80f52 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 18 Jan 2016 16:36:09 +0100 Subject: pipe: limit the per-user amount of pages allocated in pipes On no-so-small systems, it is possible for a single process to cause an OOM condition by filling large pipes with data that are never read. A typical process filling 4000 pipes with 1 MB of data will use 4 GB of memory. On small systems it may be tricky to set the pipe max size to prevent this from happening. This patch makes it possible to enforce a per-user soft limit above which new pipes will be limited to a single page, effectively limiting them to 4 kB each, as well as a hard limit above which no new pipes may be created for this user. This has the effect of protecting the system against memory abuse without hurting other users, and still allowing pipes to work correctly though with less data at once. The limit are controlled by two new sysctls : pipe-user-pages-soft, and pipe-user-pages-hard. Both may be disabled by setting them to zero. The default soft limit allows the default number of FDs per process (1024) to create pipes of the default size (64kB), thus reaching a limit of 64MB before starting to create only smaller pipes. With 256 processes limited to 1024 FDs each, this results in 1024*64kB + (256*1024 - 1024) * 4kB = 1084 MB of memory allocated for a user. The hard limit is disabled by default to avoid breaking existing applications that make intensive use of pipes (eg: for splicing). Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Signed-off-by: Willy Tarreau Signed-off-by: Al Viro --- include/linux/pipe_fs_i.h | 4 ++++ include/linux/sched.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eb8b8ac6df3c..24f5470d3944 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -42,6 +42,7 @@ struct pipe_buffer { * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers + * @user: the user who created this pipe **/ struct pipe_inode_info { struct mutex mutex; @@ -57,6 +58,7 @@ struct pipe_inode_info { struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; + struct user_struct *user; }; /* @@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned long pipe_user_pages_hard; +extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bbea871..1589ddc88e38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct user_struct { #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ -- cgit v1.2.3 From 386744425e35e04984c6e741c7750fd6eef1a9df Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 1 Jul 2015 14:17:58 +0200 Subject: swiotlb: Make linux/swiotlb.h standalone includible This header file uses the enum dma_data_direction and struct page types without explicitly including the corresponding header files. This makes it rely on the includer to have included the proper headers before. To fix this, include linux/dma-direction.h and forward-declare struct page. The swiotlb_free() function is also annotated __init, therefore requires linux/init.h to be included as well. Signed-off-by: Thierry Reding Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index e7a018eaf3a2..017fced60242 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -1,10 +1,13 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include +#include #include struct device; struct dma_attrs; +struct page; struct scatterlist; extern int swiotlb_force; -- cgit v1.2.3 From a9aec5881b9d4aca184b29d33484a6a58d23f7f2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:35 -0800 Subject: lib/iomap_copy.c: add __ioread32_copy() Some drivers need to read data out of iomem areas 32-bits at a time. Add an API to do this. Signed-off-by: Stephen Boyd Cc: Bjorn Andersson Cc: Cc: David Howells Cc: Hauke Mehrtens Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index fffd88d7f426..32403b5716e5 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -29,6 +29,7 @@ struct device; struct resource; __visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU -- cgit v1.2.3 From 243c2137cda52599f6112f52b6be5e61fa6536ae Mon Sep 17 00:00:00 2001 From: Adam Barth Date: Wed, 20 Jan 2016 14:59:09 -0800 Subject: include/linux/radix-tree.h: fix error in docs about locks This text refers to the "first 7 functions", which was correct when written but became incorrect when Johannes Weiner added another function to the list in 139e561660fe ("lib: radix_tree: tree node interface"). Change the text to correctly refer to the first 8 functions. Signed-off-by: Adam Barth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 33170dbd9db4..57e7d87d2d4c 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -154,7 +154,7 @@ do { \ * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 7 functions are able to be called locklessly, using RCU. The + * The first 8 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. -- cgit v1.2.3 From caaee6234d05a58c5b4d05e7bf766131b810a657 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: ptrace: use fsuid, fsgid, effective creds for fs access checks By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f92876..504c98a278d4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. */ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). + */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) -- cgit v1.2.3 From 4b804c85dc37db6c108832b28cd54673ff7ee037 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:19 -0800 Subject: kernel/cpu.c: export __cpu_*_mask Exporting the cpumasks __cpu_possible_mask and friends will allow us to remove the extra indirection through the cpu_*_mask variables. It will also allow the set_cpu_* functions to become static inlines, which will give a .text reduction. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..d4545a1852f2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -89,6 +89,10 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern struct cpumask __cpu_possible_mask; +extern struct cpumask __cpu_online_mask; +extern struct cpumask __cpu_present_mask; +extern struct cpumask __cpu_active_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) -- cgit v1.2.3 From 5aec01b834fd6f8ca49d1aeede665b950d0c148e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:25 -0800 Subject: kernel/cpu.c: eliminate cpu_*_mask Replace the variables cpu_possible_mask, cpu_online_mask, cpu_present_mask and cpu_active_mask with macros expanding to expressions of the same type and value, eliminating some indirection. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4545a1852f2..52ab539aefce 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -85,14 +85,14 @@ extern int nr_cpu_ids; * only one CPU. */ -extern const struct cpumask *const cpu_possible_mask; -extern const struct cpumask *const cpu_online_mask; -extern const struct cpumask *const cpu_present_mask; -extern const struct cpumask *const cpu_active_mask; extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; +#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) +#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) +#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) +#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) -- cgit v1.2.3 From 9425676a363c0976e3d43dda792dc4711a651d1d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:28 -0800 Subject: kernel/cpu.c: make set_cpu_* static inlines Almost all callers of the set_cpu_* functions pass an explicit true or false. Making them static inline thus replaces the function calls with a simple set_bit/clear_bit, saving some .text. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 52ab539aefce..fc14275ff34e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -720,14 +720,49 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -void set_cpu_possible(unsigned int cpu, bool possible); -void set_cpu_present(unsigned int cpu, bool present); -void set_cpu_online(unsigned int cpu, bool online); -void set_cpu_active(unsigned int cpu, bool active); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); +static inline void +set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &__cpu_possible_mask); + else + cpumask_clear_cpu(cpu, &__cpu_possible_mask); +} + +static inline void +set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &__cpu_present_mask); + else + cpumask_clear_cpu(cpu, &__cpu_present_mask); +} + +static inline void +set_cpu_online(unsigned int cpu, bool online) +{ + if (online) { + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); + } else { + cpumask_clear_cpu(cpu, &__cpu_online_mask); + } +} + +static inline void +set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &__cpu_active_mask); + else + cpumask_clear_cpu(cpu, &__cpu_active_mask); +} + + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap -- cgit v1.2.3 From 978e30c9b46161c792ecdad0091fd017b21b8ca5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:36 -0800 Subject: kexec: move some memembers and definitions within the scope of CONFIG_KEXEC_FILE Move the stuff currently only used by the kexec file code within CONFIG_KEXEC_FILE (and CONFIG_KEXEC_VERIFY_SIG). Also move internal "struct kexec_sha_region" and "struct kexec_buf" into "kexec_internal.h". Signed-off-by: Xunlei Pang Cc: "Eric W. Biederman" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 62 +++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 7b68d2788a56..2cc643c6e870 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -109,11 +109,7 @@ struct compat_kexec_segment { }; #endif -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* Pointer to elf header of read only purgatory */ Elf_Ehdr *ehdr; @@ -130,6 +126,28 @@ struct purgatory_info { unsigned long purgatory_load_addr; }; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); + +#ifdef CONFIG_KEXEC_VERIFY_SIG +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); +#endif + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +#ifdef CONFIG_KEXEC_VERIFY_SIG + kexec_verify_sig_t *verify_sig; +#endif +}; +#endif + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -161,6 +179,7 @@ struct kimage { struct kimage_arch arch; #endif +#ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; @@ -179,38 +198,7 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; -}; - -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - -typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); -typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(void *loader_data); -typedef int (kexec_verify_sig_t)(const char *kernel_buf, - unsigned long kernel_len); - -struct kexec_file_ops { - kexec_probe_t *probe; - kexec_load_t *load; - kexec_cleanup_t *cleanup; - kexec_verify_sig_t *verify_sig; +#endif }; /* kexec interface functions */ -- cgit v1.2.3 From a460bece027301e079b9e53c5e0f67c8e3eaebc1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Jan 2016 15:00:42 -0800 Subject: rbtree: use READ_ONCE in RB_EMPTY_ROOT With commit d72da4a4d97 ("rbtree: Make lockless searches non-fatal") our rbtrees provide weak guarantees that allows us to do lockless (and very speculative) reads of the tree. Such readers cannot see partial stores on nodes, ie left/right as well as root. As such, similar to the WRITE_ONCE semantics when doing rotations, use READ_ONCE when checking the root node in RB_EMPTY_ROOT. Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Michel Lespinasse Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index a5aa7ae671f4..b6900099ea81 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -50,7 +50,7 @@ struct rb_root { #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ -- cgit v1.2.3 From c6d308534aef6c99904bf5862066360ae067abc4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:55 -0800 Subject: UBSAN: run-time undefined behavior sanity checker UBSAN uses compile-time instrumentation to catch undefined behavior (UB). Compiler inserts code that perform certain kinds of checks before operations that could cause UB. If check fails (i.e. UB detected) __ubsan_handle_* function called to print error message. So the most of the work is done by compiler. This patch just implements ubsan handlers printing errors. GCC has this capability since 4.9.x [1] (see -fsanitize=undefined option and its suboptions). However GCC 5.x has more checkers implemented [2]. Article [3] has a bit more details about UBSAN in the GCC. [1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html [2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html [3] - http://developerblog.redhat.com/2014/10/16/gcc-undefined-behavior-sanitizer-ubsan/ Issues which UBSAN has found thus far are: Found bugs: * out-of-bounds access - 97840cb67ff5 ("netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind") undefined shifts: * d48458d4a768 ("jbd2: use a better hash function for the revoke table") * 10632008b9e1 ("clockevents: Prevent shift out of bounds") * 'x << -1' shift in ext4 - http://lkml.kernel.org/r/<5444EF21.8020501@samsung.com> * undefined rol32(0) - http://lkml.kernel.org/r/<1449198241-20654-1-git-send-email-sasha.levin@oracle.com> * undefined dirty_ratelimit calculation - http://lkml.kernel.org/r/<566594E2.3050306@odin.com> * undefined roundown_pow_of_two(0) - http://lkml.kernel.org/r/<1449156616-11474-1-git-send-email-sasha.levin@oracle.com> * [WONTFIX] undefined shift in __bpf_prog_run - http://lkml.kernel.org/r/ WONTFIX here because it should be fixed in bpf program, not in kernel. signed overflows: * 32a8df4e0b33f ("sched: Fix odd values in effective_load() calculations") * mul overflow in ntp - http://lkml.kernel.org/r/<1449175608-1146-1-git-send-email-sasha.levin@oracle.com> * incorrect conversion into rtc_time in rtc_time64_to_tm() - http://lkml.kernel.org/r/<1449187944-11730-1-git-send-email-sasha.levin@oracle.com> * unvalidated timespec in io_getevents() - http://lkml.kernel.org/r/ * [NOTABUG] signed overflow in ktime_add_safe() - http://lkml.kernel.org/r/ [akpm@linux-foundation.org: fix unused local warning] [akpm@linux-foundation.org: fix __int128 build woes] Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bbea871..02dabf281b2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1643,6 +1643,9 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; gfp_t lockdep_reclaim_gfp; #endif +#ifdef CONFIG_UBSAN + unsigned int in_ubsan; +#endif /* journalling filesystem info */ void *journal_info; -- cgit v1.2.3 From 06af1c52c9ea234e0b1266cc0b52c3e0c6c8fe9f Mon Sep 17 00:00:00 2001 From: Bongkyu Kim Date: Wed, 20 Jan 2016 15:01:08 -0800 Subject: lz4: fix wrong compress buffer size for 64-bits The current lz4 compress buffer is 16kb on 32-bits, 32kb on 64-bits system. But, lz4 needs only 16kb on both. On 64-bits, this causes wasted cpu cycles for additional memset during every compression. In case of lz4hc, the current buffer size is (256kb + 8) on 32-bits, (512kb + 16) on 64-bits. But, lz4hc needs only (256kb + 2 * pointer) on both. This patch fixes these wrong compress buffer sizes for 64-bits. Signed-off-by: Bongkyu Kim Cc: Chanho Min Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 4356686b0a39..6b784c59f321 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -9,8 +9,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) -#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) +#define LZ4_MEM_COMPRESS (16384) +#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) /* * lz4_compressbound() -- cgit v1.2.3 From 2954e440be7305134be632a94536b412899490f7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 20 Jan 2016 15:01:11 -0800 Subject: ipc/shm.c: is_file_shm_hugepages() can be boolean Make is_file_shm_hugepages() return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shm.h b/include/linux/shm.h index 6fb801686ad6..04e881829625 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -52,7 +52,7 @@ struct sysv_shm { long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); -int is_file_shm_hugepages(struct file *file); +bool is_file_shm_hugepages(struct file *file); void exit_shm(struct task_struct *task); #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else @@ -66,9 +66,9 @@ static inline long do_shmat(int shmid, char __user *shmaddr, { return -ENOSYS; } -static inline int is_file_shm_hugepages(struct file *file) +static inline bool is_file_shm_hugepages(struct file *file) { - return 0; + return false; } static inline void exit_shm(struct task_struct *task) { -- cgit v1.2.3 From e1c7e324539ada3b2b13ca2898bcb4948a9ef9db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:05 -0800 Subject: dma-mapping: always provide the dma_map_ops based implementation Move the generic implementation to now that all architectures support it and remove the HAVE_DMA_ATTR Kconfig symbol now that everyone supports them. [valentinrothberg@gmail.com: remove leftovers in Kconfig] Signed-off-by: Christoph Hellwig Cc: "David S. Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-attrs.h | 10 -- include/linux/dma-mapping.h | 379 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 361 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-attrs.h b/include/linux/dma-attrs.h index c8e1831d7572..99c0be00b47c 100644 --- a/include/linux/dma-attrs.h +++ b/include/linux/dma-attrs.h @@ -41,7 +41,6 @@ static inline void init_dma_attrs(struct dma_attrs *attrs) bitmap_zero(attrs->flags, __DMA_ATTRS_LONGS); } -#ifdef CONFIG_HAVE_DMA_ATTRS /** * dma_set_attr - set a specific attribute * @attr: attribute to set @@ -67,14 +66,5 @@ static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) BUG_ON(attr >= DMA_ATTR_MAX); return test_bit(attr, attrs->flags); } -#else /* !CONFIG_HAVE_DMA_ATTRS */ -static inline void dma_set_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ -} -static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ - return 0; -} -#endif /* CONFIG_HAVE_DMA_ATTRS */ #endif /* _DMA_ATTR_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2e551e2d2d03..cc0517b71c5e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -6,8 +6,12 @@ #include #include #include +#include #include #include +#include +#include +#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -86,7 +90,363 @@ static inline int is_device_dma_capable(struct device *dev) #ifdef CONFIG_HAS_DMA #include #else -#include +/* + * Define the dma api to allow compilation but not linking of + * dma dependent code. Code that depends on the dma-mapping + * API needs to set 'depends on HAS_DMA' in its Kconfig + */ +extern struct dma_map_ops bad_dma_ops; +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + return &bad_dma_ops; +} +#endif + +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(ptr, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, attrs); + debug_dma_map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, addr, true); + return addr; +} + +static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, true); +} + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. + */ +static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + int i, ents; + struct scatterlist *s; + + for_each_sg(sg, s, nents, i) + kmemcheck_mark_initialized(sg_virt(s), s->length); + BUG_ON(!valid_dma_direction(dir)); + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} + +static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} + +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(page_address(page) + offset, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, page, offset, size, dir, NULL); + debug_dma_map_page(dev, page, offset, size, dir, addr, false); + + return addr; +} + +static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, NULL); + debug_dma_unmap_page(dev, addr, size, dir, false); +} + +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void dma_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} + +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); +} + +static inline void dma_sync_single_range_for_device(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); + +} + +#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) +#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) +#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) +#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) + +extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller); + +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller); +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); + +/** + * dma_mmap_attrs - map a coherent DMA allocation into user space + * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices + * @vma: vm_area_struct describing requested user mapping + * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs + * @handle: device-view address returned from dma_alloc_attrs + * @size: size of memory originally requested in dma_alloc_attrs + * @attrs: attributes of mapping properties requested in dma_alloc_attrs + * + * Map a coherent DMA buffer previously allocated by dma_alloc_attrs + * into user space. The coherent DMA buffer must not be freed by the + * driver until the user space mapping has been released. + */ +static inline int +dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->mmap) + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); +} + +#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) + +int +dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +static inline int +dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->get_sgtable) + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, + attrs); + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); +} + +#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) + +#ifndef arch_dma_alloc_attrs +#define arch_dma_alloc_attrs(dev, flag) (true) +#endif + +static inline void *dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; + + BUG_ON(!ops); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) + return cpu_addr; + + if (!arch_dma_alloc_attrs(&dev, &flag)) + return NULL; + if (!ops->alloc) + return NULL; + + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + return cpu_addr; +} + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!ops); + WARN_ON(irqs_disabled()); + + if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) + return; + + if (!ops->free) + return; + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + ops->free(dev, size, cpu_addr, dma_handle, attrs); +} + +static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return dma_alloc_attrs(dev, size, dma_handle, flag, NULL); +} + +static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL); +} + +static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs); +} + +static inline void dma_free_noncoherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); +} + +static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + debug_dma_mapping_error(dev, dma_addr); + + if (get_dma_ops(dev)->mapping_error) + return get_dma_ops(dev)->mapping_error(dev, dma_addr); + +#ifdef DMA_ERROR_CODE + return dma_addr == DMA_ERROR_CODE; +#else + return 0; +#endif +} + +#ifndef HAVE_ARCH_DMA_SUPPORTED +static inline int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops) + return 0; + if (!ops->dma_supported) + return 1; + return ops->dma_supported(dev, mask); +} +#endif + +#ifndef HAVE_ARCH_DMA_SET_MASK +static inline int dma_set_mask(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->set_dma_mask) + return ops->set_dma_mask(dev, mask); + + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + *dev->dma_mask = mask; + return 0; +} #endif static inline u64 dma_get_mask(struct device *dev) @@ -259,22 +619,6 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ -#ifndef CONFIG_HAVE_DMA_ATTRS -struct dma_attrs; - -#define dma_map_single_attrs(dev, cpu_addr, size, dir, attrs) \ - dma_map_single(dev, cpu_addr, size, dir) - -#define dma_unmap_single_attrs(dev, dma_addr, size, dir, attrs) \ - dma_unmap_single(dev, dma_addr, size, dir) - -#define dma_map_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_map_sg(dev, sgl, nents, dir) - -#define dma_unmap_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_unmap_sg(dev, sgl, nents, dir) - -#else static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { @@ -300,7 +644,6 @@ static inline int dma_mmap_writecombine(struct device *dev, dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); } -#endif /* CONFIG_HAVE_DMA_ATTRS */ #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME -- cgit v1.2.3 From 20d666e41166f8023ff3d960e832d87ded18c5c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:09 -0800 Subject: dma-mapping: remove This wasn't an asm-generic header to start with, and can be merged into dma-mapping.h trivially. Signed-off-by: Christoph Hellwig Cc: "David S. Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index cc0517b71c5e..d6b575bb45a7 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -11,7 +11,6 @@ #include #include #include -#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -87,6 +86,23 @@ static inline int is_device_dma_capable(struct device *dev) return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +/* + * These three functions are only for dma allocator. + * Don't use them in device drivers. + */ +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret); +int dma_release_from_coherent(struct device *dev, int order, void *vaddr); + +int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, size_t size, int *ret); +#else +#define dma_alloc_from_coherent(dev, size, handle, ret) (0) +#define dma_release_from_coherent(dev, order, vaddr) (0) +#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ + #ifdef CONFIG_HAS_DMA #include #else @@ -568,7 +584,13 @@ static inline int dma_get_cache_alignment(void) #define DMA_MEMORY_INCLUDES_CHILDREN 0x04 #define DMA_MEMORY_EXCLUSIVE 0x08 -#ifndef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags); +void dma_release_declared_memory(struct device *dev); +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size); +#else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) @@ -587,7 +609,7 @@ dma_mark_declared_memory_occupied(struct device *dev, { return ERR_PTR(-EBUSY); } -#endif +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ /* * Managed DMA API @@ -600,13 +622,13 @@ extern void *dmam_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dmam_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); extern void dmam_release_declared_memory(struct device *dev); -#else /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, gfp_t gfp) @@ -617,7 +639,7 @@ static inline int dmam_declare_coherent_memory(struct device *dev, static inline void dmam_release_declared_memory(struct device *dev) { } -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) -- cgit v1.2.3 From 8e99469ab0f821bea77625cd4775ca529d4ca7d4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:02:12 -0800 Subject: dma-mapping: use offset_in_page macro Use offset_in_page macro instead of (addr & ~PAGE_MASK). Signed-off-by: Geliang Tang Acked-by: Will Deacon Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d6b575bb45a7..75857cda38e9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -129,10 +129,10 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, addr, true); return addr; } -- cgit v1.2.3 From 567e9ab2e614e55feca20e8bcb54b629e9cc1a3b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:24 -0800 Subject: mm: memcontrol: give the kmem states more descriptive names On any given memcg, the kmem accounting feature has three separate states: not initialized, structures allocated, and actively accounting slab memory. These are represented through a combination of the kmem_acct_activated and kmem_acct_active flags, which is confusing. Convert to a kmem_state enum with the states NONE, ALLOCATED, and ONLINE. Then rename the functions to modify the state accordingly. This follows the nomenclature of css object states more closely. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 189f04d4d2ec..54dab4d43e6d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -152,6 +152,12 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +enum memcg_kmem_state { + KMEM_NONE, + KMEM_ALLOCATED, + KMEM_ONLINE, +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -233,8 +239,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; - bool kmem_acct_activated; - bool kmem_acct_active; + enum memcg_kmem_state kmem_state; #endif int last_scanned_node; @@ -750,9 +755,9 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { - return memcg->kmem_acct_active; + return memcg->kmem_state == KMEM_ONLINE; } /* @@ -850,7 +855,7 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { return false; } -- cgit v1.2.3 From 127424c86bb6cb87f0b563d9fdcfbbaf3c86ecec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:32 -0800 Subject: mm: memcontrol: move kmem accounting code to CONFIG_MEMCG The cgroup2 memory controller will account important in-kernel memory consumers per default. Move all necessary components to CONFIG_MEMCG. Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_lru.h | 4 ++-- include/linux/memcontrol.h | 7 ++++--- include/linux/sched.h | 4 ++-- include/linux/slab.h | 2 +- include/linux/slab_def.h | 3 ++- include/linux/slub_def.h | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 2a6b9947aaa3..cb0ba9f2a9a2 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -40,7 +40,7 @@ struct list_lru_node { spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg *memcg_lrus; #endif @@ -48,7 +48,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) struct list_head list; #endif }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 54dab4d43e6d..a87704e3668e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -236,7 +236,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif -#if defined(CONFIG_MEMCG_KMEM) +#ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; @@ -735,7 +735,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; @@ -891,5 +891,6 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 02dabf281b2f..f1e81e128592 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1476,10 +1476,10 @@ struct task_struct { unsigned in_iowait:1; #ifdef CONFIG_MEMCG unsigned memcg_may_oom:1; -#endif -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 3ffee7422012..3627d5c1bc47 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -86,7 +86,7 @@ #else # define SLAB_FAILSLAB 0x00000000UL #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) # define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ #else # define SLAB_ACCOUNT 0x00000000UL diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 33d049066c3d..cf139d3fa513 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -69,7 +69,8 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_MEMCG_KMEM + +#ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33885118523c..b7e57927f521 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -84,7 +84,7 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 489c2a20a414351fe0813a727c34600c0f7292ae Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:41 -0800 Subject: mm: memcontrol: introduce CONFIG_MEMCG_LEGACY_KMEM Let the user know that CONFIG_MEMCG_KMEM does not apply to the cgroup2 interface. This also makes legacy-only code sections stand out better. [arnd@arndb.de: mm: memcontrol: only manage socket pressure for CONFIG_INET] Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a87704e3668e..2bb14d021cd0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,7 +233,7 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif #ifndef CONFIG_SLOB @@ -717,7 +717,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; #endif -- cgit v1.2.3 From d886f4e483ce63a3304adc9eda87031b93341c28 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:47 -0800 Subject: mm: memcontrol: rein in the CONFIG space madness What CONFIG_INET and CONFIG_LEGACY_KMEM guard inside the memory controller code is insignificant, having these conditionals is not worth the complication and fragility that comes with them. [akpm@linux-foundation.org: rework mem_cgroup_css_free() statement ordering] Signed-off-by: Johannes Weiner Cc: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bb14d021cd0..47995b499429 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,9 +233,11 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + unsigned long socket_pressure; + + /* Legacy tcp memory accounting */ struct cg_proto tcp_mem; -#endif + #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -254,10 +256,6 @@ struct mem_cgroup { struct wb_domain cgwb_domain; #endif -#ifdef CONFIG_INET - unsigned long socket_pressure; -#endif - /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -712,15 +710,13 @@ void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); -#if defined(CONFIG_MEMCG) && defined(CONFIG_INET) +#ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; -#endif do { if (time_before(jiffies, memcg->socket_pressure)) return true; -- cgit v1.2.3 From 0db1529817b7b16226421f01470c5ba982c5f302 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:50 -0800 Subject: mm: memcontrol: flatten struct cg_proto There are no more external users of struct cg_proto, flatten the structure into struct mem_cgroup. Since using those struct members doesn't stand out as much anymore, add cgroup2 static branches to make it clearer which code is legacy. Suggested-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47995b499429..a3869bf97746 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,12 +85,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct cg_proto { - struct page_counter memory_allocated; /* Current allocated memory. */ - int memory_pressure; - bool active; -}; - #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; @@ -169,8 +163,11 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + + /* Legacy consumer-oriented counters */ struct page_counter memsw; struct page_counter kmem; + struct page_counter tcpmem; /* Normal memory consumption range */ unsigned long low; @@ -236,7 +233,8 @@ struct mem_cgroup { unsigned long socket_pressure; /* Legacy tcp memory accounting */ - struct cg_proto tcp_mem; + bool tcpmem_active; + int tcpmem_pressure; #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ @@ -715,7 +713,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { - if (memcg->tcp_mem.memory_pressure) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { if (time_before(jiffies, memcg->socket_pressure)) -- cgit v1.2.3 From 0b8f73e104285a4badf9d768d1c39b06d77d1f97 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:53 -0800 Subject: mm: memcontrol: clean up alloc, online, offline, free functions The creation and teardown of struct mem_cgroup is fairly messy and that has attracted mistakes and subtle bugs before. The main cause for this is that there is no clear model about what needs to happen when, and that attracts more chaos. So create one: 1. mem_cgroup_alloc() should allocate struct mem_cgroup and its auxiliary members and initialize work items, locks etc. so that the object it returns is fully initialized and in a neutral state. 2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new memcg object and configure it and the system according to the role of the new memory-controlled cgroup in the hierarchy. 3. mem_cgroup_css_online() is no longer needed to synchronize with iterators, but it verifies css->id which isn't available earlier. 4. mem_cgroup_css_offline() implements stuff that needs to happen upon the user-visible destruction of a cgroup, which includes stopping all user interfacing as well as releasing certain structures when continued memory consumption would be unexpected at that point. 5. mem_cgroup_css_free() prepares the system and the memcg object for the object's disappearance, neutralizes its state, and then gives it back to mem_cgroup_free(). 6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory. [arnd@arndb.de: fix SLOB build regression] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3869bf97746..27123e597eca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,9 +181,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* css_online() has been completed */ - int initialized; - /* * Should the accounting and control be hierarchical, per subtree? */ -- cgit v1.2.3 From 37e84351198be087335ad2b2253b35c7cc76a5ad Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:56 -0800 Subject: mm: memcontrol: charge swap to cgroup2 This patchset introduces swap accounting to cgroup2. This patch (of 7): In the legacy hierarchy we charge memsw, which is dubious, because: - memsw.limit must be >= memory.limit, so it is impossible to limit swap usage less than memory usage. Taking into account the fact that the primary limiting mechanism in the unified hierarchy is memory.high while memory.limit is either left unset or set to a very large value, moving memsw.limit knob to the unified hierarchy would effectively make it impossible to limit swap usage according to the user preference. - memsw.usage != memory.usage + swap.usage, because a page occupying both swap entry and a swap cache page is charged only once to memsw counter. As a result, it is possible to effectively eat up to memory.limit of memory pages *and* memsw.limit of swap entries, which looks unexpected. That said, we should provide a different swap limiting mechanism for cgroup2. This patch adds mem_cgroup->swap counter, which charges the actual number of swap entries used by a cgroup. It is only charged in the unified hierarchy, while the legacy hierarchy memsw logic is left intact. The swap usage can be monitored using new memory.swap.current file and limited using memory.swap.max. Note, to charge swap resource properly in the unified hierarchy, we have to make swap_entry_free uncharge swap only when ->usage reaches zero, not just ->count, i.e. when all references to a swap entry, including the one taken by swap cache, are gone. This is necessary, because otherwise swap-in could result in uncharging swap even if the page is still in swap cache and hence still occupies a swap entry. At the same time, this shouldn't break memsw counter logic, where a page is never charged twice for using both memory and swap, because in case of legacy hierarchy we uncharge swap on commit (see mem_cgroup_commit_charge). Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + include/linux/swap.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 27123e597eca..6e0126230878 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -163,6 +163,7 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + struct page_counter swap; /* Legacy consumer-oriented counters */ struct page_counter memsw; diff --git a/include/linux/swap.h b/include/linux/swap.h index 414e101cd061..83b95f343ab1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -368,11 +368,17 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { } +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } -- cgit v1.2.3 From eb01aaab43084f1c919ce66183fea005033351b9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:02 -0800 Subject: mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since get_scan_count(), which is the only user of this function, now possesses pointer to memcg, let's pass memcg directly to mem_cgroup_online() instead of picking it out of lruvec and rename the function accordingly. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e0126230878..166661708410 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -355,6 +355,13 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return true; + return !!(memcg->css.flags & CSS_ONLINE); +} + /* * For memory reclaim. */ @@ -363,20 +370,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages); -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -{ - struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return true; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - memcg = mz->memcg; - - return !!(memcg->css.flags & CSS_ONLINE); -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { @@ -589,13 +582,13 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +static inline bool +mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { return true; } -- cgit v1.2.3 From 6f2cb2f17700a39567cf3e9a2e95041def5f3688 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:05 -0800 Subject: swap.h: move memcg related stuff to the end of the file The following patches will add more functions to the memcg section of include/linux/swap.h. Some of them will need values defined below the current location of the section. So let's move the section to the end of the file. No functional changes intended. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 70 ++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 83b95f343ab1..c2bd163a5e7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -350,39 +350,7 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); -#ifdef CONFIG_MEMCG -static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) -{ - /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) - return vm_swappiness; - - return memcg->swappiness; -} -#else -static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) -{ - return vm_swappiness; -} -#endif -#ifdef CONFIG_MEMCG_SWAP -extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); -#else -static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) -{ -} -static inline int mem_cgroup_try_charge_swap(struct page *page, - swp_entry_t entry) -{ - return 0; -} -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) -{ -} -#endif #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); @@ -561,5 +529,43 @@ static inline swp_entry_t get_swap_page(void) } #endif /* CONFIG_SWAP */ + +#ifdef CONFIG_MEMCG +static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) +{ + /* root ? */ + if (mem_cgroup_disabled() || !memcg->css.parent) + return vm_swappiness; + + return memcg->swappiness; +} + +#else +static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) +{ + return vm_swappiness; +} +#endif + +#ifdef CONFIG_MEMCG_SWAP +extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +#else +static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ +} + +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ -- cgit v1.2.3 From d8b38438a0bcb362c396f49d8279ef7b505917f4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:07 -0800 Subject: mm: vmscan: do not scan anon pages if memcg swap limit is hit We don't scan anonymous memory if we ran out of swap, neither should we do it in case memcg swap limit is hit, because swap out is impossible anyway. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c2bd163a5e7e..a587050204f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -551,6 +551,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -565,6 +566,11 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } + +static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + return get_nr_swap_pages(); +} #endif #endif /* __KERNEL__*/ -- cgit v1.2.3 From 5ccc5abaaf6f9242cc63342c5286990233f392fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:10 -0800 Subject: mm: free swap cache aggressively if memcg swap is full Swap cache pages are freed aggressively if swap is nearly full (>50% currently), because otherwise we are likely to stop scanning anonymous when we near the swap limit even if there is plenty of freeable swap cache pages. We should follow the same trend in case of memory cgroup, which has its own swap limit. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a587050204f9..d18b65c53dbb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -552,6 +552,7 @@ extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); +extern bool mem_cgroup_swap_full(struct page *page); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -571,6 +572,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } + +static inline bool mem_cgroup_swap_full(struct page *page) +{ + return vm_swap_full(); +} #endif #endif /* __KERNEL__*/ -- cgit v1.2.3 From b2807f07f4f87362925b8a5b8cbb7b624da10f03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:22 -0800 Subject: mm: memcontrol: add "sock" to cgroup2 memory.stat Provide statistics on how much of a cgroup's memory footprint is made up of socket buffers from network connections owned by the group. Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 166661708410..9ae48d4aeb5e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,9 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, + /* default hierarchy stats */ + MEMCG_SOCK, + MEMCG_NR_STAT, }; struct mem_cgroup_reclaim_cookie { @@ -87,7 +90,7 @@ enum mem_cgroup_events_target { #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; + long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; -- cgit v1.2.3 From 8d7f9ecb371a15e48754fa816e3f716517df7b13 Mon Sep 17 00:00:00 2001 From: "majd@mellanox.com" Date: Thu, 14 Jan 2016 19:12:59 +0200 Subject: net/mlx5_core: Export transport objects To be used by mlx5_ib in the following patches for implementing RAW PACKET QP. Add mlx5_core_ prefix to alloc and delloc transport_domain since they are exposed now. Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx5/transobj.h | 74 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 include/linux/mlx5/transobj.h (limited to 'include/linux') diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h new file mode 100644 index 000000000000..376229f09499 --- /dev/null +++ b/include/linux/mlx5/transobj.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __TRANSOBJ_H__ +#define __TRANSOBJ_H__ + +#include + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); +int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqn); +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); +void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); +int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *sqn); +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); +void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tirn); +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, + int inlen); +void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *tisn); +void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); +int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rmpn); +int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen); +int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn); +int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out); +int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); +int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rmpn); +int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn); +int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out); +int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); + +int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqtn); +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen); +void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn); + +#endif /* __TRANSOBJ_H__ */ -- cgit v1.2.3 From e2013b212f9f201c71fc5826ce41f39ebece0852 Mon Sep 17 00:00:00 2001 From: "majd@mellanox.com" Date: Thu, 14 Jan 2016 19:13:00 +0200 Subject: net/mlx5_core: Add RQ and SQ event handling RQ/SQ will be used to implement IB verbs QPs, so the IB QP affiliated events are affiliated also with SQs and RQs. Since SQ, RQ and QP resource numbers do not share the same name space, a queue type field was added to the event data to specify the SW object that the event is affiliated with. Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx5/device.h | 12 +++++++++++- include/linux/mlx5/driver.h | 8 +++++--- include/linux/mlx5/qp.h | 8 ++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 48c4623ad651..b7eaccf997ff 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -223,6 +223,14 @@ enum { #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) + +enum { + MLX5_EVENT_QUEUE_TYPE_QP = 0, + MLX5_EVENT_QUEUE_TYPE_RQ = 1, + MLX5_EVENT_QUEUE_TYPE_SQ = 2, +}; + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -479,7 +487,9 @@ struct mlx5_eqe_comp { }; struct mlx5_eqe_qp_srq { - __be32 reserved[6]; + __be32 reserved1[5]; + u8 type; + u8 reserved2[3]; __be32 qp_srq_n; }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 53c57724c8dd..ae8f91528b6f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -346,9 +346,11 @@ struct mlx5_core_mr { }; enum mlx5_res_type { - MLX5_RES_QP, - MLX5_RES_SRQ, - MLX5_RES_XSRQ, + MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP, + MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ, + MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, + MLX5_RES_SRQ = 3, + MLX5_RES_XSRQ = 4, }; struct mlx5_core_rsc_common { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fd1ff4110e80..431176ec70e2 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -651,6 +651,14 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, u8 context, int error); #endif +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq); static inline const char *mlx5_qp_type_str(int type) { -- cgit v1.2.3 From 6d2f89df04b796e7dcc4f9f8dc0d8f04ad7f144b Mon Sep 17 00:00:00 2001 From: "majd@mellanox.com" Date: Thu, 14 Jan 2016 19:13:05 +0200 Subject: IB/mlx5: Add Raw Packet QP query functionality Since Raw Packet QP is composed of RQ and SQ, the IB QP's state is derived from the sub-objects. Therefore we need to query each one of the sub-objects, and decide on the IB QP's state. Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx5/qp.h | 11 ++++++++++- include/linux/mlx5/transobj.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 431176ec70e2..f033c7a1490c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -85,7 +85,16 @@ enum mlx5_qp_state { MLX5_QP_STATE_ERR = 6, MLX5_QP_STATE_SQ_DRAINING = 7, MLX5_QP_STATE_SUSPENDED = 9, - MLX5_QP_NUM_STATE + MLX5_QP_NUM_STATE, + MLX5_QP_STATE, + MLX5_QP_STATE_BAD, +}; + +enum { + MLX5_SQ_STATE_NA = MLX5_SQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQ_STATE_NA + 1, + MLX5_RQ_STATE_NA = MLX5_RQC_STATE_ERR + 1, + MLX5_RQ_NUM_STATE = MLX5_RQ_STATE_NA + 1, }; enum { diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index 376229f09499..d259e4c423dd 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -41,10 +41,12 @@ int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn); int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn); int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, -- cgit v1.2.3 From 75850d0bcece42416ba81bd38e4c719f101c832d Mon Sep 17 00:00:00 2001 From: "majd@mellanox.com" Date: Thu, 14 Jan 2016 19:13:06 +0200 Subject: IB/mlx5: Support setting Ethernet priority for Raw Packet QPs When the user changes the Address Vector(AV) in the modify QP, he provides an SL. This SL should be translated to Ethernet Priority by taking the 3 LSB bits, and modify the QP's TIS according to this Ethernet priority. Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- include/linux/mlx5/transobj.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 991283b51f61..4633b88b0c3b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4052,6 +4052,13 @@ struct mlx5_ifc_modify_tis_out_bits { u8 reserved_1[0x40]; }; +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_0[0x20]; + + u8 reserved_1[0x1f]; + u8 prio[0x1]; +}; + struct mlx5_ifc_modify_tis_in_bits { u8 opcode[0x10]; u8 reserved_0[0x10]; @@ -4064,7 +4071,7 @@ struct mlx5_ifc_modify_tis_in_bits { u8 reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; u8 reserved_4[0x40]; diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index d259e4c423dd..88441f5ece25 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -54,6 +54,8 @@ int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn); -- cgit v1.2.3 From 427c1e7bcd7e5cd62160fcda0ce215ebbe0da3a1 Mon Sep 17 00:00:00 2001 From: "majd@mellanox.com" Date: Thu, 14 Jan 2016 19:13:07 +0200 Subject: {IB, net}/mlx5: Move the modify QP operation table to mlx5_ib When modifying a QP, the desired operation was determined in the mlx5_core using a transition table that takes the current state, the final state, and returns the desired operation. Since this logic will be used for Raw Packet QP, move the operation table to the mlx5_ib. Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- include/linux/mlx5/qp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f033c7a1490c..5b8c89ffaa58 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -641,8 +641,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, - enum mlx5_qp_state new_state, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, -- cgit v1.2.3 From fae3fde65138b6071b1b0e0b567d4058a8b6a88c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:00:50 +0100 Subject: perf: Collapse and fix event_function_call() users There is one common bug left in all the event_function_call() users, between loading ctx->task and getting to the remote_function(), ctx->task can already have been changed. Therefore we need to double check and retry if ctx->task != current. Insert another trampoline specific to event_function_call() that checks for this and further validates state. This also allows getting rid of the active/inactive functions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6612732d8fd0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); -extern int __perf_event_disable(void *info); +extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * -- cgit v1.2.3 From eade1fe75fd3ad89c10d011969c0eed8f13e28e0 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 17 Nov 2015 14:52:18 +0800 Subject: ceph: remove unused functions in ceph_frag.h These functions were introduced in commit 3d14c5d2b ("ceph: factor out libceph from Ceph file system"). Howover, there's no user of these functions since then, so remove them for simplicity. Signed-off-by: Yaowei Bai Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_frag.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..970ba5cb1409 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -44,42 +44,7 @@ static inline int ceph_frag_contains_value(__u32 f, __u32 v) { return (v & ceph_frag_mask(f)) == ceph_frag_value(f); } -static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) -{ - /* is sub as specific as us, and contained by us? */ - return ceph_frag_bits(sub) >= ceph_frag_bits(f) && - (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); -} -static inline __u32 ceph_frag_parent(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f) - 1, - ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); -} -static inline int ceph_frag_is_left_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; -} -static inline int ceph_frag_is_right_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; -} -static inline __u32 ceph_frag_sibling(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); -} -static inline __u32 ceph_frag_left_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); -} -static inline __u32 ceph_frag_right_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, - ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); -} static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) { int newbits = ceph_frag_bits(f) + by; -- cgit v1.2.3 From 79a3ed2e98557a2844e75c203e742f9c229ad1d3 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Tue, 17 Nov 2015 14:52:19 +0800 Subject: ceph: ceph_frag_contains_value can be boolean This patch makes ceph_frag_contains_value return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 970ba5cb1409..b827e066e55a 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -40,7 +40,7 @@ static inline __u32 ceph_frag_mask_shift(__u32 f) return 24 - ceph_frag_bits(f); } -static inline int ceph_frag_contains_value(__u32 f, __u32 v) +static inline bool ceph_frag_contains_value(__u32 f, __u32 v) { return (v & ceph_frag_mask(f)) == ceph_frag_value(f); } -- cgit v1.2.3 From 67645d7619738e51c668ca69f097cb90b5470422 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 28 Dec 2015 13:18:34 +0300 Subject: libceph: fix ceph_msg_revoke() There are a number of problems with revoking a "was sending" message: (1) We never make any attempt to revoke data - only kvecs contibute to con->out_skip. However, once the header (envelope) is written to the socket, our peer learns data_len and sets itself to expect at least data_len bytes to follow front or front+middle. If ceph_msg_revoke() is called while the messenger is sending message's data portion, anything we send after that call is counted by the OSD towards the now revoked message's data portion. The effects vary, the most common one is the eventual hang - higher layers get stuck waiting for the reply to the message that was sent out after ceph_msg_revoke() returned and treated by the OSD as a bunch of data bytes. This is what Matt ran into. (2) Flat out zeroing con->out_kvec_bytes worth of bytes to handle kvecs is wrong. If ceph_msg_revoke() is called before the tag is sent out or while the messenger is sending the header, we will get a connection reset, either due to a bad tag (0 is not a valid tag) or a bad header CRC, which kind of defeats the purpose of revoke. Currently the kernel client refuses to work with header CRCs disabled, but that will likely change in the future, making this even worse. (3) con->out_skip is not reset on connection reset, leading to one or more spurious connection resets if we happen to get a real one between con->out_skip is set in ceph_msg_revoke() and before it's cleared in write_partial_skip(). Fixing (1) and (3) is trivial. The idea behind fixing (2) is to never zero the tag or the header, i.e. send out tag+header regardless of when ceph_msg_revoke() is called. That way the header is always correct, no unnecessary resets are induced and revoke stands ready for disabled CRCs. Since ceph_msg_revoke() rips out con->out_msg, introduce a new "message out temp" and copy the header into it before sending. Cc: stable@vger.kernel.org # 4.0+ Reported-by: Matt Conner Signed-off-by: Ilya Dryomov Tested-by: Matt Conner Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 71b1d6cdcb5d..8dbd7879fdc6 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -220,6 +220,7 @@ struct ceph_connection { struct ceph_entity_addr actual_peer_addr; /* message out temps */ + struct ceph_msg_header out_hdr; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; @@ -229,7 +230,6 @@ struct ceph_connection { int out_kvec_left; /* kvec's left in out_kvec */ int out_skip; /* skip this many bytes */ int out_kvec_bytes; /* total bytes left */ - bool out_kvec_is_msg; /* kvec refers to out_msg */ int out_more; /* there is more data after the kvecs */ __le64 out_temp_ack; /* for writing an ack */ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 -- cgit v1.2.3 From b6ec57f4b92e9bae4617f7d98a054d45370284bb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 21 Jan 2016 16:40:25 -0800 Subject: thp: change pmd_trans_huge_lock() interface to return ptl After THP refcounting rework we have only two possible return values from pmd_trans_huge_lock(): success and failure. Return-by-pointer for ptl doesn't make much sense in this case. Let's convert pmd_trans_huge_lock() to return ptl on success and NULL on failure. Signed-off-by: Kirill A. Shutemov Suggested-by: Linus Torvalds Cc: Minchan Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cfe81e10bd54..459fd25b378e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl); +extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma); /* mmap_sem must be held on entry */ -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) - return __pmd_trans_huge_lock(pmd, vma, ptl); + return __pmd_trans_huge_lock(pmd, vma); else return false; } @@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { - return false; + return NULL; } static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 5955102c9984fa081b2d570cfac75c97eecf8f3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Jan 2016 15:40:57 -0500 Subject: wrappers for ->i_mutex access parallel to mutex_{lock,unlock,trylock,is_locked,lock_nested}, inode_foo(inode) being mutex_foo(&inode->i_mutex). Please, use those for access to ->i_mutex; over the coming cycle ->i_mutex will become rwsem, with ->lookup() done with it held only shared. Signed-off-by: Al Viro --- include/linux/fs.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..2df6c033c3f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -714,6 +714,31 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT2, }; +static inline void inode_lock(struct inode *inode) +{ + mutex_lock(&inode->i_mutex); +} + +static inline void inode_unlock(struct inode *inode) +{ + mutex_unlock(&inode->i_mutex); +} + +static inline int inode_trylock(struct inode *inode) +{ + return mutex_trylock(&inode->i_mutex); +} + +static inline int inode_is_locked(struct inode *inode) +{ + return mutex_is_locked(&inode->i_mutex); +} + +static inline void inode_lock_nested(struct inode *inode, unsigned subclass) +{ + mutex_lock_nested(&inode->i_mutex, subclass); +} + void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); @@ -3047,8 +3072,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) } static inline bool dir_relax(struct inode *inode) { - mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + inode_unlock(inode); + inode_lock(inode); return !IS_DEADDIR(inode); } -- cgit v1.2.3 From 3ed47db34f480df7caf44436e3e63e555351ae9a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 22 Jan 2016 18:08:52 -0500 Subject: make sure that freeing shmem fast symlinks is RCU-delayed Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Al Viro --- include/linux/shmem_fs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a43f41cb3c43..4d4780c00d34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -15,10 +15,7 @@ struct shmem_inode_info { unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ - union { - unsigned long swapped; /* subtotal assigned to swap */ - char *symlink; /* unswappable short symlink */ - }; + unsigned long swapped; /* subtotal assigned to swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct simple_xattrs xattrs; /* list of xattrs */ -- cgit v1.2.3 From 3f4a2670deea53e3765e24a7f46aafe6f077cb68 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:37 -0800 Subject: pmem: add wb_cache_pmem() to the PMEM API __arch_wb_cache_pmem() was already an internal implementation detail of the x86 PMEM API, but this functionality needs to be exported as part of the general PMEM API to handle the fsync/msync case for DAX mmaps. One thing worth noting is that we really do want this to be part of the PMEM API as opposed to a stand-alone function like clflush_cache_range() because of ordering restrictions. By having wb_cache_pmem() as part of the PMEM API we can leave it unordered, call it multiple times to write back large amounts of memory, and then order the multiple calls with a single wmb_pmem(). Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pmem.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pmem.h b/include/linux/pmem.h index acfea8ce4a07..7c3d11a6b4ad 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { BUG(); } + +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) +{ + BUG(); +} #endif /* * Architectures that define ARCH_HAS_PMEM_API must provide * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() + * and arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { @@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size) else default_clear_pmem(addr, size); } + +/** + * wb_cache_pmem - write back processor cache for PMEM memory range + * @addr: virtual start address + * @size: number of bytes to write back + * + * Write back the processor cache range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline void wb_cache_pmem(void __pmem *addr, size_t size) +{ + if (arch_has_pmem_api()) + arch_wb_cache_pmem(addr, size); +} #endif /* __PMEM_H__ */ -- cgit v1.2.3 From f9fe48bece3af2d60e1bad65db4825f5a025dd36 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:40 -0800 Subject: dax: support dirty DAX entries in radix tree Add support for tracking dirty DAX entries in the struct address_space radix tree. This tree is already used for dirty page writeback, and it already supports the use of exceptional (non struct page*) entries. In order to properly track dirty DAX pages we will insert new exceptional entries into the radix tree that represent dirty DAX PTE or PMD pages. These exceptional entries will also contain the writeback addresses for the PTE or PMD faults that we can use at fsync/msync time. There are currently two types of exceptional entries (shmem and shadow) that can be placed into the radix tree, and this adds a third. We rely on the fact that only one type of exceptional entry can be found in a given radix tree based on its usage. This happens for free with DAX vs shmem but we explicitly prevent shadow entries from being added to radix trees for DAX mappings. The only shadow entries that would be generated for DAX radix trees would be to track zero page mappings that were created for holes. These pages would receive minimal benefit from having shadow entries, and the choice to have only one type of exceptional entry in a given radix tree makes the logic simpler both in clear_exceptional_entry() and in the rest of DAX. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 5 +++++ include/linux/fs.h | 3 ++- include/linux/radix-tree.h | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b415e521528d..e9d57f680f50 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -36,4 +36,9 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } + +static inline bool dax_mapping(struct address_space *mapping) +{ + return mapping->host && IS_DAX(mapping->host); +} #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..0d7570320d63 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,8 @@ struct address_space { struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ - unsigned long nrshadows; /* number of shadow entries */ + /* number of shadow or DAX exceptional entries */ + unsigned long nrexceptional; pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 57e7d87d2d4c..7c88ad156a29 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -51,6 +51,15 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); -- cgit v1.2.3 From 7e7f774984cd88c45c18e7ffaf0256c3e9118043 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:44 -0800 Subject: mm: add find_get_entries_tag() Add find_get_entries_tag() to the family of functions that include find_get_entries(), find_get_pages() and find_get_pages_tag(). This is needed for DAX dirty page handling because we need a list of both page offsets and radix tree entries ('indices' and 'entries' in this function) that are marked with the PAGECACHE_TAG_TOWRITE tag. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4d08b6c33557..92395a0a7dc5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); -- cgit v1.2.3 From 9973c98ecfda3a1dfcab981665b5f1e39bcde64a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:47 -0800 Subject: dax: add support for fsync/sync To properly handle fsync/msync in an efficient way DAX needs to track dirty pages so it is able to flush them durably to media on demand. The tracking of dirty pages is done via the radix tree in struct address_space. This radix tree is already used by the page writeback infrastructure for tracking dirty pages associated with an open file, and it already has support for exceptional (non struct page*) entries. We build upon these features to add exceptional entries to the radix tree for DAX dirty PMD or PTE pages at fault time. [dan.j.williams@intel.com: fix dax_pmd_dbg build warning] Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index e9d57f680f50..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif -- cgit v1.2.3 From 2572f00db8a68bb46001678c1c98ad8b70e04b31 Mon Sep 17 00:00:00 2001 From: Joshua Henderson Date: Wed, 13 Jan 2016 18:15:39 -0700 Subject: MIPS: Add support for PIC32MZDA platform This adds support for the Microchip PIC32 MIPS microcontroller with the specific variant PIC32MZDA. PIC32MZDA is based on the MIPS m14KEc core and boots using device tree. This includes an early pin setup and early clock setup needed prior to device tree being initialized. In additon, an interface is provided to synchronize access to registers shared across several peripherals. Signed-off-by: Joshua Henderson Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12097/ Signed-off-by: Ralf Baechle --- include/linux/platform_data/sdhci-pic32.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/platform_data/sdhci-pic32.h (limited to 'include/linux') diff --git a/include/linux/platform_data/sdhci-pic32.h b/include/linux/platform_data/sdhci-pic32.h new file mode 100644 index 000000000000..7e0efe64c8c5 --- /dev/null +++ b/include/linux/platform_data/sdhci-pic32.h @@ -0,0 +1,22 @@ +/* + * Purna Chandra Mandal, purna.mandal@microchip.com + * Copyright (C) 2015 Microchip Technology Inc. All rights reserved. + * + * This program is free software; you can distribute it and/or modify it + * under the terms of the GNU General Public License (Version 2) as + * published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ +#ifndef __PIC32_SDHCI_PDATA_H__ +#define __PIC32_SDHCI_PDATA_H__ + +struct pic32_sdhci_platform_data { + /* read & write fifo threshold */ + int (*setup_dma)(u32 rfifo, u32 wfifo); +}; + +#endif -- cgit v1.2.3 From 3271e6103189c5294acb06ffa504cc5495457fbf Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 13 Dec 2015 22:45:30 +0000 Subject: MIPS: bcm963xx: Add Broadcom BCM963xx board nvram data structure Broadcom BCM963xx boards have multiple nvram variants across different SoCs with additional checksum fields added whenever the size of the nvram was extended. Add this structure as a header file so that multiple drivers can use it. Signed-off-by: Simon Arlott Cc: David Woodhouse Cc: Brian Norris Cc: Kevin Cernekee Cc: Florian Fainelli Cc: Jonas Gorski Cc: Linux Kernel Mailing List Cc: MIPS Mailing List Cc: MTD Maling List Patchwork: https://patchwork.linux-mips.org/patch/11830/ Signed-off-by: Ralf Baechle --- include/linux/bcm963xx_nvram.h | 112 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 include/linux/bcm963xx_nvram.h (limited to 'include/linux') diff --git a/include/linux/bcm963xx_nvram.h b/include/linux/bcm963xx_nvram.h new file mode 100644 index 000000000000..290c231b8cf1 --- /dev/null +++ b/include/linux/bcm963xx_nvram.h @@ -0,0 +1,112 @@ +#ifndef __LINUX_BCM963XX_NVRAM_H__ +#define __LINUX_BCM963XX_NVRAM_H__ + +#include +#include +#include +#include + +/* + * Broadcom BCM963xx SoC board nvram data structure. + * + * The nvram structure varies in size depending on the SoC board version. Use + * the appropriate minimum BCM963XX_NVRAM_*_SIZE define for the information + * you need instead of sizeof(struct bcm963xx_nvram) as this may change. + */ + +#define BCM963XX_NVRAM_V4_SIZE 300 +#define BCM963XX_NVRAM_V5_SIZE (1 * SZ_1K) + +#define BCM963XX_DEFAULT_PSI_SIZE 64 + +enum bcm963xx_nvram_nand_part { + BCM963XX_NVRAM_NAND_PART_BOOT = 0, + BCM963XX_NVRAM_NAND_PART_ROOTFS_1, + BCM963XX_NVRAM_NAND_PART_ROOTFS_2, + BCM963XX_NVRAM_NAND_PART_DATA, + BCM963XX_NVRAM_NAND_PART_BBT, + + __BCM963XX_NVRAM_NAND_NR_PARTS +}; + +struct bcm963xx_nvram { + u32 version; + char bootline[256]; + char name[16]; + u32 main_tp_number; + u32 psi_size; + u32 mac_addr_count; + u8 mac_addr_base[ETH_ALEN]; + u8 __reserved1[2]; + u32 checksum_v4; + + u8 __reserved2[292]; + u32 nand_part_offset[__BCM963XX_NVRAM_NAND_NR_PARTS]; + u32 nand_part_size[__BCM963XX_NVRAM_NAND_NR_PARTS]; + u8 __reserved3[388]; + u32 checksum_v5; +}; + +#define BCM963XX_NVRAM_NAND_PART_OFFSET(nvram, part) \ + bcm963xx_nvram_nand_part_offset(nvram, BCM963XX_NVRAM_NAND_PART_ ##part) + +static inline u64 __pure bcm963xx_nvram_nand_part_offset( + const struct bcm963xx_nvram *nvram, + enum bcm963xx_nvram_nand_part part) +{ + return nvram->nand_part_offset[part] * SZ_1K; +} + +#define BCM963XX_NVRAM_NAND_PART_SIZE(nvram, part) \ + bcm963xx_nvram_nand_part_size(nvram, BCM963XX_NVRAM_NAND_PART_ ##part) + +static inline u64 __pure bcm963xx_nvram_nand_part_size( + const struct bcm963xx_nvram *nvram, + enum bcm963xx_nvram_nand_part part) +{ + return nvram->nand_part_size[part] * SZ_1K; +} + +/* + * bcm963xx_nvram_checksum - Verify nvram checksum + * + * @nvram: pointer to full size nvram data structure + * @expected_out: optional pointer to store expected checksum value + * @actual_out: optional pointer to store actual checksum value + * + * Return: 0 if the checksum is valid, otherwise -EINVAL + */ +static int __maybe_unused bcm963xx_nvram_checksum( + const struct bcm963xx_nvram *nvram, + u32 *expected_out, u32 *actual_out) +{ + u32 expected, actual; + size_t len; + + if (nvram->version <= 4) { + expected = nvram->checksum_v4; + len = BCM963XX_NVRAM_V4_SIZE - sizeof(u32); + } else { + expected = nvram->checksum_v5; + len = BCM963XX_NVRAM_V5_SIZE - sizeof(u32); + } + + /* + * Calculate the CRC32 value for the nvram with a checksum value + * of 0 without modifying or copying the nvram by combining: + * - The CRC32 of the nvram without the checksum value + * - The CRC32 of a zero checksum value (which is also 0) + */ + actual = crc32_le_combine( + crc32_le(~0, (u8 *)nvram, len), 0, sizeof(u32)); + + if (expected_out) + *expected_out = expected; + + if (actual_out) + *actual_out = actual; + + return expected == actual ? 0 : -EINVAL; +}; + +#endif /* __LINUX_BCM963XX_NVRAM_H__ */ -- cgit v1.2.3 From 8fce60b8d0c62363c29d64efb0cceb98519f0350 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 13 Dec 2015 22:46:59 +0000 Subject: MIPS: bcm963xx: Move Broadcom BCM963xx image tag data structure Move Broadcom BCM963xx image tag data structure to include/linux/ so that drivers outside of mach-bcm63xx can use it. Signed-off-by: Simon Arlott Cc: David Woodhouse Cc: Brian Norris Cc: Kevin Cernekee Cc: Florian Fainelli Cc: Jonas Gorski Cc: Linux Kernel Mailing List Cc: MIPS Mailing List Cc: MTD Maling List Patchwork: https://patchwork.linux-mips.org/patch/11832/ Signed-off-by: Ralf Baechle --- include/linux/bcm963xx_tag.h | 98 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 include/linux/bcm963xx_tag.h (limited to 'include/linux') diff --git a/include/linux/bcm963xx_tag.h b/include/linux/bcm963xx_tag.h new file mode 100644 index 000000000000..f389dace6d95 --- /dev/null +++ b/include/linux/bcm963xx_tag.h @@ -0,0 +1,98 @@ +#ifndef __LINUX_BCM963XX_TAG_H__ +#define __LINUX_BCM963XX_TAG_H__ + +#include + +#define TAGVER_LEN 4 /* Length of Tag Version */ +#define TAGLAYOUT_LEN 4 /* Length of FlashLayoutVer */ +#define SIG1_LEN 20 /* Company Signature 1 Length */ +#define SIG2_LEN 14 /* Company Signature 2 Length */ +#define BOARDID_LEN 16 /* Length of BoardId */ +#define ENDIANFLAG_LEN 2 /* Endian Flag Length */ +#define CHIPID_LEN 6 /* Chip Id Length */ +#define IMAGE_LEN 10 /* Length of Length Field */ +#define ADDRESS_LEN 12 /* Length of Address field */ +#define DUALFLAG_LEN 2 /* Dual Image flag Length */ +#define INACTIVEFLAG_LEN 2 /* Inactie Flag Length */ +#define RSASIG_LEN 20 /* Length of RSA Signature in tag */ +#define TAGINFO1_LEN 30 /* Length of vendor information field1 in tag */ +#define FLASHLAYOUTVER_LEN 4 /* Length of Flash Layout Version String tag */ +#define TAGINFO2_LEN 16 /* Length of vendor information field2 in tag */ +#define ALTTAGINFO_LEN 54 /* Alternate length for vendor information; Pirelli */ + +#define NUM_PIRELLI 2 +#define IMAGETAG_CRC_START 0xFFFFFFFF + +#define PIRELLI_BOARDS { \ + "AGPF-S0", \ + "DWV-S0", \ +} + +/* + * The broadcom firmware assumes the rootfs starts the image, + * therefore uses the rootfs start (flash_image_address) + * to determine where to flash the image. Since we have the kernel first + * we have to give it the kernel address, but the crc uses the length + * associated with this address (root_length), which is added to the kernel + * length (kernel_length) to determine the length of image to flash and thus + * needs to be rootfs + deadcode (jffs2 EOF marker) +*/ + +struct bcm_tag { + /* 0-3: Version of the image tag */ + char tag_version[TAGVER_LEN]; + /* 4-23: Company Line 1 */ + char sig_1[SIG1_LEN]; + /* 24-37: Company Line 2 */ + char sig_2[SIG2_LEN]; + /* 38-43: Chip this image is for */ + char chip_id[CHIPID_LEN]; + /* 44-59: Board name */ + char board_id[BOARDID_LEN]; + /* 60-61: Map endianness -- 1 BE 0 LE */ + char big_endian[ENDIANFLAG_LEN]; + /* 62-71: Total length of image */ + char total_length[IMAGE_LEN]; + /* 72-83: Address in memory of CFE */ + char cfe__address[ADDRESS_LEN]; + /* 84-93: Size of CFE */ + char cfe_length[IMAGE_LEN]; + /* 94-105: Address in memory of image start + * (kernel for OpenWRT, rootfs for stock firmware) + */ + char flash_image_start[ADDRESS_LEN]; + /* 106-115: Size of rootfs */ + char root_length[IMAGE_LEN]; + /* 116-127: Address in memory of kernel */ + char kernel_address[ADDRESS_LEN]; + /* 128-137: Size of kernel */ + char kernel_length[IMAGE_LEN]; + /* 138-139: Unused at the moment */ + char dual_image[DUALFLAG_LEN]; + /* 140-141: Unused at the moment */ + char inactive_flag[INACTIVEFLAG_LEN]; + /* 142-161: RSA Signature (not used; some vendors may use this) */ + char rsa_signature[RSASIG_LEN]; + /* 162-191: Compilation and related information (not used in OpenWrt) */ + char information1[TAGINFO1_LEN]; + /* 192-195: Version flash layout */ + char flash_layout_ver[FLASHLAYOUTVER_LEN]; + /* 196-199: kernel+rootfs CRC32 */ + __u32 fskernel_crc; + /* 200-215: Unused except on Alice Gate where is is information */ + char information2[TAGINFO2_LEN]; + /* 216-219: CRC32 of image less imagetag (kernel for Alice Gate) */ + __u32 image_crc; + /* 220-223: CRC32 of rootfs partition */ + __u32 rootfs_crc; + /* 224-227: CRC32 of kernel partition */ + __u32 kernel_crc; + /* 228-235: Unused at present */ + char reserved1[8]; + /* 236-239: CRC32 of header excluding last 20 bytes */ + __u32 header_crc; + /* 240-255: Unused at present */ + char reserved2[16]; +}; + +#endif /* __LINUX_BCM63XX_TAG_H__ */ -- cgit v1.2.3 From 1f29cb19cb7c3bea870d7da02ec23823af9d636e Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 13 Dec 2015 22:47:55 +0000 Subject: MIPS: bcm963xx: Move extended flash address to bcm_tag header file The extended flash address needs to be subtracted from bcm_tag flash image offsets. Move this value to the bcm_tag header file. Renamed define name to consistently use bcm963xx for flash layout which should be considered a property of the board and not the SoC (i.e. bcm63xx could theoretically be used on a board without CFE or any flash). Signed-off-by: Simon Arlott Cc: David Woodhouse Cc: Brian Norris Cc: Kevin Cernekee Cc: Florian Fainelli Cc: Jonas Gorski Cc: Linux Kernel Mailing List Cc: MIPS Mailing List Cc: MTD Maling List Patchwork: https://patchwork.linux-mips.org/patch/11833/ Signed-off-by: Ralf Baechle --- include/linux/bcm963xx_tag.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcm963xx_tag.h b/include/linux/bcm963xx_tag.h index f389dace6d95..08e0133820ed 100644 --- a/include/linux/bcm963xx_tag.h +++ b/include/linux/bcm963xx_tag.h @@ -28,6 +28,11 @@ "DWV-S0", \ } +/* Extended flash address, needs to be subtracted + * from bcm_tag flash image offsets. + */ +#define BCM963XX_EXTENDED_SIZE 0xBFC00000 + /* * The broadcom firmware assumes the rootfs starts the image, * therefore uses the rootfs start (flash_image_address) -- cgit v1.2.3 From 696569f759cdebc7da67666fc4f962eaee13562b Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 13 Dec 2015 22:48:44 +0000 Subject: MIPS: bcm963xx: Update bcm_tag field image_sequence The "dual_image" and "inactive_flag" fields should be merged into a single "image_sequence" field. Signed-off-by: Simon Arlott Cc: David Woodhouse Cc: Brian Norris Cc: Kevin Cernekee Cc: Florian Fainelli Cc: Jonas Gorski Cc: Linux Kernel Mailing List Cc: MIPS Mailing List Cc: MTD Maling List Patchwork: https://patchwork.linux-mips.org/patch/11834/ Signed-off-by: Ralf Baechle --- include/linux/bcm963xx_tag.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bcm963xx_tag.h b/include/linux/bcm963xx_tag.h index 08e0133820ed..161c7b37a77b 100644 --- a/include/linux/bcm963xx_tag.h +++ b/include/linux/bcm963xx_tag.h @@ -12,8 +12,7 @@ #define CHIPID_LEN 6 /* Chip Id Length */ #define IMAGE_LEN 10 /* Length of Length Field */ #define ADDRESS_LEN 12 /* Length of Address field */ -#define DUALFLAG_LEN 2 /* Dual Image flag Length */ -#define INACTIVEFLAG_LEN 2 /* Inactie Flag Length */ +#define IMAGE_SEQUENCE_LEN 4 /* Image sequence Length */ #define RSASIG_LEN 20 /* Length of RSA Signature in tag */ #define TAGINFO1_LEN 30 /* Length of vendor information field1 in tag */ #define FLASHLAYOUTVER_LEN 4 /* Length of Flash Layout Version String tag */ @@ -72,10 +71,10 @@ struct bcm_tag { char kernel_address[ADDRESS_LEN]; /* 128-137: Size of kernel */ char kernel_length[IMAGE_LEN]; - /* 138-139: Unused at the moment */ - char dual_image[DUALFLAG_LEN]; - /* 140-141: Unused at the moment */ - char inactive_flag[INACTIVEFLAG_LEN]; + /* 138-141: Image sequence number + * (to be incremented when flashed with a new image) + */ + char image_sequence[IMAGE_SEQUENCE_LEN]; /* 142-161: RSA Signature (not used; some vendors may use this) */ char rsa_signature[RSASIG_LEN]; /* 162-191: Compilation and related information (not used in OpenWrt) */ -- cgit v1.2.3 From facc432faa59414bd7c60c307ff1645154a66c98 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Jan 2016 11:43:44 +0100 Subject: net: simplify napi_synchronize() to avoid warnings The napi_synchronize() function is defined twice: The definition for SMP builds waits for other CPUs to be done, while the uniprocessor variant just contains a barrier and ignores its argument. In the mvneta driver, this leads to a warning about an unused variable when we lookup the NAPI struct of another CPU and then don't use it: ethernet/marvell/mvneta.c: In function 'mvneta_percpu_notifier': ethernet/marvell/mvneta.c:2910:30: error: unused variable 'other_port' [-Werror=unused-variable] There are no other CPUs on a UP build, so that code never runs, but gcc does not know this. The nicest solution seems to be to turn the napi_synchronize() helper into an inline function for the UP case as well, as that leads gcc to not complain about the argument being unused. Once we do that, we can also combine the two cases into a single function definition and use if(IS_ENABLED()) rather than #ifdef to make it look a bit nicer. The warning first came up in linux-4.4, but I failed to catch it earlier. Signed-off-by: Arnd Bergmann Fixes: f86428854480 ("net: mvneta: Statically assign queues to CPUs") Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ac140dcb789..289c2314d766 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -512,7 +512,6 @@ static inline void napi_enable(struct napi_struct *n) clear_bit(NAPI_STATE_NPSVC, &n->state); } -#ifdef CONFIG_SMP /** * napi_synchronize - wait until NAPI is not running * @n: napi context @@ -523,12 +522,12 @@ static inline void napi_enable(struct napi_struct *n) */ static inline void napi_synchronize(const struct napi_struct *n) { - while (test_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); + if (IS_ENABLED(CONFIG_SMP)) + while (test_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + else + barrier(); } -#else -# define napi_synchronize(n) barrier() -#endif enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, -- cgit v1.2.3 From 530cbe100ef7587aa5b5ac3a4b670cda4d50e598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 13:52:25 +0000 Subject: irqdomain: Allow domain lookup with DOMAIN_BUS_WIRED token Let's take the (outlandish) example of an interrupt controller capable of handling both wired interrupts and PCI MSIs. With the current code, the PCI MSI domain is going to be tagged with DOMAIN_BUS_PCI_MSI, and the wired domain with DOMAIN_BUS_ANY. Things get hairy when we start looking up the domain for a wired interrupt (typically when creating it based on some firmware information - DT or ACPI). In irq_create_fwspec_mapping(), we perform the lookup using DOMAIN_BUS_ANY, which is actually used as a wildcard. This gives us one chance out of two to end up with the wrong domain, and we try to configure a wired interrupt with the MSI domain. Everything grinds to a halt pretty quickly. What we really need to do is to start looking for a domain that would uniquely identify a wired interrupt domain, and only use DOMAIN_BUS_ANY as a fallback. In order to solve this, let's introduce a new DOMAIN_BUS_WIRED token, which is going to be used exactly as described above. Of course, this depends on the irqchip to setup the domain bus_token, and nobody had to implement this so far. Only so far. Signed-off-by: Marc Zyngier Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Cc: Grant Likely Cc: Thomas Petazzoni Cc: Jiang Liu Link: http://lkml.kernel.org/r/1453816347-32720-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f64622ad02c1..04579d9fbce4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -70,6 +70,7 @@ struct irq_fwspec { */ enum irq_domain_bus_token { DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, -- cgit v1.2.3 From 0bfd464d3fdd5bb322f9cace4cc47f1796545cf7 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 21:13:44 -0800 Subject: tty: Wait interruptibly for tty lock on reopen Allow a signal to interrupt the wait for a tty reopen; eg., if the tty has starting final close and is waiting for the device to drain. Signed-off-by: Peter Hurley Cc: stable # 4.4 Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 2fd8708ea888..d9fb4b043f56 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -649,6 +649,7 @@ extern long vt_compat_ioctl(struct tty_struct *tty, /* tty_mutex.c */ /* functions for preparation of BKL removal */ extern void __lockfunc tty_lock(struct tty_struct *tty); +extern int tty_lock_interruptible(struct tty_struct *tty); extern void __lockfunc tty_unlock(struct tty_struct *tty); extern void __lockfunc tty_lock_slave(struct tty_struct *tty); extern void __lockfunc tty_unlock_slave(struct tty_struct *tty); -- cgit v1.2.3 From b3c6de492b9ea30a8dcc535d4dc2eaaf0bb3f116 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 21 Jan 2016 16:47:29 +0100 Subject: cleancache: constify cleancache_ops structure The cleancache_ops structure is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/cleancache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index bda5ec0b4b4d..cb3e14234502 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -37,7 +37,7 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern int cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); -- cgit v1.2.3 From a39bb9a0557a3fc860c3c9d67a7326b0d2f1bd66 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 7 Jan 2016 05:06:13 +0800 Subject: include/linux/cleancache.h: Clean up code Let cleancache_fs_enabled() call cleancache_fs_enabled_mapping() directly. Remove redundant variable ret in cleancache_get_page(). Signed-off-by: Chen Gang Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/cleancache.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index cb3e14234502..fccf7f44139d 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -48,14 +48,14 @@ extern void __cleancache_invalidate_fs(struct super_block *); #ifdef CONFIG_CLEANCACHE #define cleancache_enabled (1) -static inline bool cleancache_fs_enabled(struct page *page) -{ - return page->mapping->host->i_sb->cleancache_poolid >= 0; -} static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) { return mapping->host->i_sb->cleancache_poolid >= 0; } +static inline bool cleancache_fs_enabled(struct page *page) +{ + return cleancache_fs_enabled_mapping(page->mapping); +} #else #define cleancache_enabled (0) #define cleancache_fs_enabled(_page) (0) @@ -89,11 +89,9 @@ static inline void cleancache_init_shared_fs(struct super_block *sb) static inline int cleancache_get_page(struct page *page) { - int ret = -1; - if (cleancache_enabled && cleancache_fs_enabled(page)) - ret = __cleancache_get_page(page); - return ret; + return __cleancache_get_page(page); + return -1; } static inline void cleancache_put_page(struct page *page) -- cgit v1.2.3 From e03e7ee34fdd1c3ef494949a75cb8c61c7265fa9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Jan 2016 20:59:49 -0800 Subject: perf/bpf: Convert perf_event_array to use struct file Robustify refcounting. Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160126045947.GA40151@ast-mbp.thefacebook.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6612732d8fd0..4f90434b8d64 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); -extern struct perf_event *perf_event_get(unsigned int fd); +extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } -static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); -- cgit v1.2.3 From c6e5b73242d2d9172ea880483bc4ba7ffca0cfb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2016 16:07:41 +0200 Subject: perf: Synchronously clean up child events The orphan cleanup workqueue doesn't always catch orphans, for example, if they never schedule after they are orphaned. IOW, the event leak is still very real. It also wouldn't work for kernel counters. Doing it synchonously is a little hairy due to lock inversion issues, but is made to work. Patch based on work by Alexander Shishkin. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f90434b8d64..b35a61a481fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -634,9 +634,6 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; - - struct delayed_work orphans_remove; - bool orphans_remove_sched; }; /* -- cgit v1.2.3 From 0d9bacb6c8265e7aa75ac28ae9b5d7748065942f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Jan 2016 14:28:48 +0900 Subject: iommu: Update struct iommu_ops comments Update the comments around struct iommu_ops to match current state and fix a few typos while at it. Signed-off-by: Magnus Damm Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f28dff313b07..a5c539fa5d2b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -133,8 +133,9 @@ struct iommu_dm_region { /** * struct iommu_ops - iommu ops and capabilities - * @domain_init: init iommu domain - * @domain_destroy: destroy iommu domain + * @capable: check capability + * @domain_alloc: allocate iommu domain + * @domain_free: free iommu domain * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain @@ -144,8 +145,15 @@ struct iommu_dm_region { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes + * @get_dm_regions: Request list of direct mapping requirements for a device + * @put_dm_regions: Free list of direct mapping requirements for a device + * @domain_window_enable: Configure and enable a particular window for a domain + * @domain_window_disable: Disable a particular window for a domain + * @domain_set_windows: Set the number of windows for a domain + * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping * @pgsize_bitmap: bitmap of supported page sizes * @priv: per-instance data private to the iommu driver @@ -182,9 +190,9 @@ struct iommu_ops { int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot); void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); - /* Set the numer of window per domain */ + /* Set the number of windows per domain */ int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count); - /* Get the numer of window per domain */ + /* Get the number of windows per domain */ u32 (*domain_get_windows)(struct iommu_domain *domain); #ifdef CONFIG_OF_IOMMU -- cgit v1.2.3 From 65f87ee71852a754f7981d0653e7136039b8798a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 25 Jan 2016 17:23:18 -0800 Subject: fs, block: force direct-I/O for dax-enabled block devices Similar to the file I/O path, re-direct all I/O to the DAX path for I/O to a block-device special file. Both regular files and device special files can use the common filp->f_mapping->host lookup to determing is DAX is enabled. Otherwise, we confuse the DAX code that does not expect to find live data in the page cache: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 7676 at mm/filemap.c:217 __delete_from_page_cache+0x9f6/0xb60() Modules linked in: CPU: 0 PID: 7676 Comm: a.out Not tainted 4.4.0+ #276 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 00000000ffffffff ffff88006d3f7738 ffffffff82999e2d 0000000000000000 ffff8800620a0000 ffffffff86473d20 ffff88006d3f7778 ffffffff81352089 ffffffff81658d36 ffffffff86473d20 00000000000000d9 ffffea0000009d60 Call Trace: [< inline >] __dump_stack lib/dump_stack.c:15 [] dump_stack+0x6f/0xa2 lib/dump_stack.c:50 [] warn_slowpath_common+0xd9/0x140 kernel/panic.c:482 [] warn_slowpath_null+0x29/0x30 kernel/panic.c:515 [] __delete_from_page_cache+0x9f6/0xb60 mm/filemap.c:217 [] delete_from_page_cache+0x112/0x200 mm/filemap.c:244 [] __dax_fault+0x859/0x1800 fs/dax.c:487 [] blkdev_dax_fault+0x26/0x30 fs/block_dev.c:1730 [< inline >] wp_pfn_shared mm/memory.c:2208 [] do_wp_page+0xc85/0x14f0 mm/memory.c:2307 [< inline >] handle_pte_fault mm/memory.c:3323 [< inline >] __handle_mm_fault mm/memory.c:3417 [] handle_mm_fault+0x2483/0x4640 mm/memory.c:3446 [] __do_page_fault+0x376/0x960 arch/x86/mm/fault.c:1238 [] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331 [] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264 [] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986 [] entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 ---[ end trace dae21e0f85f1f98c ]--- Fixes: 5a023cdba50c ("block: enable dax for raw block devices") Reported-by: Dmitry Vyukov Reported-by: Kirill A. Shutemov Suggested-by: Jan Kara Reviewed-by: Jan Kara Suggested-by: Matthew Wilcox Tested-by: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a2046275cdf..b10002d4a5f5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2907,7 +2907,7 @@ extern void replace_mount_options(struct super_block *sb, char *options); static inline bool io_is_direct(struct file *filp) { - return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp)); + return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); } static inline int iocb_flags(struct file *file) -- cgit v1.2.3 From 9f4736fe7ca804aa79b5916221bb13dfc6221a0f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:13:39 -0800 Subject: block: revert runtime dax control of the raw block device Dynamically enabling DAX requires that the page cache first be flushed and invalidated. This must occur atomically with the change of DAX mode otherwise we confuse the fsync/msync tracking and violate data durability guarantees. Eliminate the possibilty of DAX-disabled to DAX-enabled transitions for now and revisit this for the next cycle. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox Cc: Andrew Morton Cc: Ross Zwisler Signed-off-by: Dan Williams --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b10002d4a5f5..ae681002100a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -484,9 +484,6 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -#ifdef CONFIG_FS_DAX - int bd_map_count; -#endif }; /* -- cgit v1.2.3 From d1a5f2b4d8a125943dcb6b032fc7eaefc2c78296 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 28 Jan 2016 20:25:31 -0800 Subject: block: use DAX for partition table reads Avoid populating pagecache when the block device is in DAX mode. Otherwise these page cache entries collide with the fsync/msync implementation and break data durability guarantees. Cc: Jan Kara Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Dave Chinner Cc: Andrew Morton Reported-by: Ross Zwisler Tested-by: Ross Zwisler Reviewed-by: Matthew Wilcox Signed-off-by: Dan Williams --- include/linux/dax.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 8204c3dc3800..818e45078929 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -14,6 +14,17 @@ int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, dax_iodone_t); + +#ifdef CONFIG_FS_DAX +struct page *read_dax_sector(struct block_device *bdev, sector_t n); +#else +static inline struct page *read_dax_sector(struct block_device *bdev, + sector_t n) +{ + return ERR_PTR(-ENXIO); +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, unsigned int flags, get_block_t, dax_iodone_t); -- cgit v1.2.3 From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } -- cgit v1.2.3