From 923386f761f5ff35da2ac778839876fe4a2f165f Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 18 Dec 2015 15:36:57 +0200 Subject: clk: ti: remove un-used definitions from public clk_hw_omap struct Clksel support has been deprecated a while back, so remove these from the struct also. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 6110fe09ed18..07308db5a15d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -129,8 +129,6 @@ struct clk_hw_omap_ops { * @enable_bit: bitshift to write to enable/disable the clock (see @enable_reg) * @flags: see "struct clk.flags possibilities" above * @clksel_reg: for clksel clks, register va containing src/divisor select - * @clksel_mask: bitmask in @clksel_reg for the src/divisor selector - * @clksel: for clksel clks, pointer to struct clksel for this clock * @dpll_data: for DPLLs, pointer to struct dpll_data for this clock * @clkdm_name: clockdomain name that this clock is contained in * @clkdm: pointer to struct clockdomain, resolved from @clkdm_name at runtime @@ -145,8 +143,6 @@ struct clk_hw_omap { u8 enable_bit; u8 flags; void __iomem *clksel_reg; - u32 clksel_mask; - const struct clksel *clksel; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; -- cgit v1.2.3 From b6f27b2db2df395d65b02a758861c7fc54edbec1 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:10:11 +0300 Subject: clk: ti: add clkdm_lookup to the exported functions This will be needed to move some additional clockdomain functionality under clock driver. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 07308db5a15d..bc7fd8f0fb5d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -213,6 +213,7 @@ struct clk_omap_reg { * @clk_writel: pointer to register write function * @clkdm_clk_enable: pointer to clockdomain enable function * @clkdm_clk_disable: pointer to clockdomain disable function + * @clkdm_lookup: pointer to clockdomain lookup function * @cm_wait_module_ready: pointer to CM module wait ready function * @cm_split_idlest_reg: pointer to CM module function to split idlest reg * @@ -228,6 +229,7 @@ struct ti_clk_ll_ops { int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); + struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, -- cgit v1.2.3 From 2e1a294c0f2273a6d3537c91965ca46a6483bd8c Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:13:38 +0300 Subject: clk: ti: move omap2_init_clk_clkdm under TI clock driver This is not needed outside the driver, so move it inside it and remove the prototype from the public header also. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index bc7fd8f0fb5d..626ae94b7444 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -238,7 +238,6 @@ struct ti_clk_ll_ops { #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -void omap2_init_clk_clkdm(struct clk_hw *clk); int omap2_clk_disable_autoidle_all(void); int omap2_clk_enable_autoidle_all(void); int omap2_clk_allow_idle(struct clk *clk); -- cgit v1.2.3 From c91f07801f144920f8467486a1e36e42ed9d9ff2 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 30 Jan 2017 16:01:36 +0200 Subject: clk: ti: drop unnecessary MEMMAP_ADDRESSING flag This has been superceded by the usage of ti_clk_ll_ops for now. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 626ae94b7444..affdabd0b6a1 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -168,7 +168,6 @@ struct clk_hw_omap { * should be used. This is a temporary solution - a better approach * would be to associate clock type-specific data with the clock, * similar to the struct dpll_data approach. - * MEMMAP_ADDRESSING: Use memmap addressing to access clock registers. */ #define ENABLE_REG_32BIT (1 << 0) /* Use 32-bit access */ #define CLOCK_IDLE_CONTROL (1 << 1) @@ -176,7 +175,6 @@ struct clk_hw_omap { #define ENABLE_ON_INIT (1 << 3) /* Enable upon framework init */ #define INVERT_ENABLE (1 << 4) /* 0 enables, 1 disables */ #define CLOCK_CLKOUTX2 (1 << 5) -#define MEMMAP_ADDRESSING (1 << 6) /* CM_CLKEN_PLL*.EN* bit values - not all are available for every DPLL */ #define DPLL_LOW_POWER_STOP 0x1 -- cgit v1.2.3 From 6c0afb503937a12a8d20a805fcf263e31afa9871 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Thu, 9 Feb 2017 11:24:37 +0200 Subject: clk: ti: convert to use proper register definition for all accesses Currently, TI clock driver uses an encapsulated struct that is cast into a void pointer to store all register addresses. This can be considered as rather nasty hackery, and prevents from expanding the register address field also. Instead, replace all the code to use proper struct in place for this, which contains all the previously used data. This patch is rather large as it is touching multiple files, but this can't be split up as we need to avoid any boot breakage. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index affdabd0b6a1..d18da839b810 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -18,6 +18,18 @@ #include #include +/** + * struct clk_omap_reg - OMAP register declaration + * @offset: offset from the master IP module base address + * @index: index of the master IP module + */ +struct clk_omap_reg { + void __iomem *ptr; + u16 offset; + u8 index; + u8 flags; +}; + /** * struct dpll_data - DPLL registers and integration data * @mult_div1_reg: register containing the DPLL M and N bitfields @@ -67,12 +79,12 @@ * can be placed into read-only space. */ struct dpll_data { - void __iomem *mult_div1_reg; + struct clk_omap_reg mult_div1_reg; u32 mult_mask; u32 div1_mask; struct clk_hw *clk_bypass; struct clk_hw *clk_ref; - void __iomem *control_reg; + struct clk_omap_reg control_reg; u32 enable_mask; unsigned long last_rounded_rate; u16 last_rounded_m; @@ -84,8 +96,8 @@ struct dpll_data { u16 max_divider; unsigned long max_rate; u8 modes; - void __iomem *autoidle_reg; - void __iomem *idlest_reg; + struct clk_omap_reg autoidle_reg; + struct clk_omap_reg idlest_reg; u32 autoidle_mask; u32 freqsel_mask; u32 idlest_mask; @@ -113,10 +125,10 @@ struct clk_hw_omap; */ struct clk_hw_omap_ops { void (*find_idlest)(struct clk_hw_omap *oclk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void (*find_companion)(struct clk_hw_omap *oclk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void (*allow_idle)(struct clk_hw_omap *oclk); void (*deny_idle)(struct clk_hw_omap *oclk); @@ -139,10 +151,10 @@ struct clk_hw_omap { struct list_head node; unsigned long fixed_rate; u8 fixed_div; - void __iomem *enable_reg; + struct clk_omap_reg enable_reg; u8 enable_bit; u8 flags; - void __iomem *clksel_reg; + struct clk_omap_reg clksel_reg; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; @@ -195,16 +207,6 @@ enum { CLK_MAX_MEMMAPS }; -/** - * struct clk_omap_reg - OMAP register declaration - * @offset: offset from the master IP module base address - * @index: index of the master IP module - */ -struct clk_omap_reg { - u16 offset; - u16 index; -}; - /** * struct ti_clk_ll_ops - low-level ops for clocks * @clk_readl: pointer to register read function @@ -222,16 +224,16 @@ struct clk_omap_reg { * operations not provided directly by clock drivers. */ struct ti_clk_ll_ops { - u32 (*clk_readl)(void __iomem *reg); - void (*clk_writel)(u32 val, void __iomem *reg); + u32 (*clk_readl)(const struct clk_omap_reg *reg); + void (*clk_writel)(u32 val, const struct clk_omap_reg *reg); int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); - int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, - u8 *idlest_reg_id); + int (*cm_split_idlest_reg)(struct clk_omap_reg *idlest_reg, + s16 *prcm_inst, u8 *idlest_reg_id); }; #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -- cgit v1.2.3 From 7f501f0a72036dc29ad9a53811474c393634b401 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 24 May 2016 19:20:05 +0200 Subject: mtd: nand: Store nand ID in struct nand_chip Store the NAND ID in struct nand_chip to avoid passing id_data and id_len as function parameters. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger Reviewed-by: Marek Vasut --- include/linux/mtd/nand.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9591e0fbe5bd..e2c11351b1bd 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -464,6 +464,17 @@ struct nand_jedec_params { __le16 crc; } __packed; +/** + * struct nand_id - NAND id structure + * @data: buffer containing the id bytes. Currently 8 bytes large, but can + * be extended if required. + * @len: ID length. + */ +struct nand_id { + u8 data[8]; + int len; +}; + /** * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock @@ -793,6 +804,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is * currently in data_buf. * @subpagesize: [INTERN] holds the subpagesize + * @id: [INTERN] holds NAND ID * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), * non 0 if ONFI supported. * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), @@ -881,6 +893,7 @@ struct nand_chip { int badblockpos; int badblockbits; + struct nand_id id; int onfi_version; int jedec_version; union { -- cgit v1.2.3 From 8cfb9ab68f90703d419870fce7ac21ac401399f2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sat, 7 Jan 2017 15:15:57 +0100 Subject: mtd: nand: Rename the nand_manufacturers struct Drop the 's' at the end of nand_manufacturers since the struct is actually describing a single manufacturer, not a manufacturer table. Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index e2c11351b1bd..9c679e8bde42 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1062,17 +1062,17 @@ struct nand_flash_dev { }; /** - * struct nand_manufacturers - NAND Flash Manufacturer ID Structure + * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. */ -struct nand_manufacturers { +struct nand_manufacturer { int id; char *name; }; extern struct nand_flash_dev nand_flash_ids[]; -extern struct nand_manufacturers nand_manuf_ids[]; +extern struct nand_manufacturer nand_manuf_ids[]; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From bcc678c2d7a0e0af14cb3d858ebd367be378c172 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sat, 7 Jan 2017 15:48:25 +0100 Subject: mtd: nand: Do not expose the NAND manufacturer table directly There is no reason to expose the NAND manufacturer table. Provide an helper function to find manufacturers by their id. We also turn the nand_manufacturers table into a const array, since its members are not modified after the initial assignment. Finally, we remove the sentinel manufacturer entry from the manufacturers table (we already have the array size information given by ARRAY_SIZE()), and add the nand_manufacturer_name() helper to handle the "Unknown" case properly. Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9c679e8bde42..6415aa16043c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1071,8 +1071,15 @@ struct nand_manufacturer { char *name; }; +const struct nand_manufacturer *nand_get_manufacturer(u8 id); + +static inline const char * +nand_manufacturer_name(const struct nand_manufacturer *manufacturer) +{ + return manufacturer ? manufacturer->name : "Unknown"; +} + extern struct nand_flash_dev nand_flash_ids[]; -extern struct nand_manufacturer nand_manuf_ids[]; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From abbe26d144ec22bb067fa414d717b9f7ca2e12bd Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 09:32:55 +0200 Subject: mtd: nand: Add manufacturer specific initialization/detection steps A lot of NANDs are implementing generic features in a non-generic way, or are providing advanced auto-detection logic where the NAND ID bytes meaning changes with the NAND generation. Providing this vendor specific initialization step will allow us to get rid of full-id entries in the nand_ids table or all the vendor specific cases added over the time in the generic NAND ID decoding logic. Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 6415aa16043c..ee9a19f42293 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -731,6 +731,20 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) return &conf->timings.sdr; } +/** + * struct nand_manufacturer_ops - NAND Manufacturer operations + * @detect: detect the NAND memory organization and capabilities + * @init: initialize all vendor specific fields (like the ->read_retry() + * implementation) if any. + * @cleanup: the ->init() function may have allocated resources, ->cleanup() + * is here to let vendor specific code release those resources. + */ +struct nand_manufacturer_ops { + void (*detect)(struct nand_chip *chip); + int (*init)(struct nand_chip *chip); + void (*cleanup)(struct nand_chip *chip); +}; + /** * struct nand_chip - NAND Private Flash Chip Data * @mtd: MTD device registered to the MTD framework @@ -835,6 +849,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * additional error status checks (determine if errors are * correctable). * @write_page: [REPLACEABLE] High-level page write function + * @manufacturer: [INTERN] Contains manufacturer information */ struct nand_chip { @@ -923,6 +938,11 @@ struct nand_chip { struct nand_bbt_descr *badblock_pattern; void *priv; + + struct { + const struct nand_manufacturer *desc; + void *priv; + } manufacturer; }; extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; @@ -959,6 +979,17 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) chip->priv = priv; } +static inline void nand_set_manufacturer_data(struct nand_chip *chip, + void *priv) +{ + chip->manufacturer.priv = priv; +} + +static inline void *nand_get_manufacturer_data(struct nand_chip *chip) +{ + return chip->manufacturer.priv; +} + /* * NAND Flash Manufacturer ID Codes */ @@ -1065,10 +1096,12 @@ struct nand_flash_dev { * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. + * @ops: manufacturer operations */ struct nand_manufacturer { int id; char *name; + const struct nand_manufacturer_ops *ops; }; const struct nand_manufacturer *nand_get_manufacturer(u8 id); @@ -1246,4 +1279,6 @@ int nand_reset(struct nand_chip *chip, int chipnr); /* Free resources held by the NAND device */ void nand_cleanup(struct nand_chip *chip); +/* Default extended ID decoding function */ +void nand_decode_ext_id(struct nand_chip *chip); #endif /* __LINUX_MTD_NAND_H */ -- cgit v1.2.3 From c51d0ac59f24200dfdccc897ff7c3c9446c7599a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:22:19 +0200 Subject: mtd: nand: Move Samsung specific init/detection logic in nand_samsung.c Move Samsung specific initialization and detection logic into nand_samsung.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- include/linux/mtd/nand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index ee9a19f42293..2f83cb55392f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,8 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; + int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 01389b6bd2f4f7649cdbb4a99a15d9e0c05d6f8c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:30:18 +0200 Subject: mtd: nand: Move Hynix specific init/detection logic in nand_hynix.c Move Hynix specific initialization and detection logic into nand_hynix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- include/linux/mtd/nand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2f83cb55392f..74e3a231cb56 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1115,6 +1115,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; +extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 9b2d61f80b060ce3ea5af2a99e148b0b214932b2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:34:57 +0200 Subject: mtd: nand: Move Toshiba specific init/detection logic in nand_toshiba.c Move Toshiba specific initialization and detection logic into nand_toshiba.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- include/linux/mtd/nand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 74e3a231cb56..dd9e3b5ddd4f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; -- cgit v1.2.3 From 10d4e75c36f6c16311dde1461f318210da357219 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:38:57 +0200 Subject: mtd: nand: Move Micron specific init logic in nand_micron.c Move Micron specific initialization logic into nand_micron.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- include/linux/mtd/nand.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index dd9e3b5ddd4f..7d0f18ecbf57 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -366,26 +366,6 @@ struct onfi_ext_param_page { */ } __packed; -struct nand_onfi_vendor_micron { - u8 two_plane_read; - u8 read_cache; - u8 read_unique_id; - u8 dq_imped; - u8 dq_imped_num_settings; - u8 dq_imped_feat_addr; - u8 rb_pulldown_strength; - u8 rb_pulldown_strength_feat_addr; - u8 rb_pulldown_strength_num_settings; - u8 otp_mode; - u8 otp_page_start; - u8 otp_data_prot_addr; - u8 otp_num_pages; - u8 otp_feat_addr; - u8 read_retry_options; - u8 reserved[72]; - u8 param_revision; -} __packed; - struct jedec_ecc_info { u8 ecc_bits; u8 codeword_size; @@ -1117,6 +1097,7 @@ extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; +extern const struct nand_manufacturer_ops micron_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 229204da53b31d576fcc1c93a33626943ea8202c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:42:23 +0200 Subject: mtd: nand: Move AMD/Spansion specific init/detection logic in nand_amd.c Move AMD/Spansion specific initialization/detection logic into nand_amd.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- include/linux/mtd/nand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7d0f18ecbf57..97dce42778e9 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1098,6 +1098,7 @@ extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; +extern const struct nand_manufacturer_ops amd_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 3b5206f4be9b65d2f0f85b3239cf117a1d0de7ce Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:43:26 +0200 Subject: mtd: nand: Move Macronix specific initialization in nand_macronix.c Move Macronix specific initialization logic into nand_macronix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 97dce42778e9..c7de017c7f4c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1099,6 +1099,7 @@ extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; extern const struct nand_manufacturer_ops amd_nand_manuf_ops; +extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 967c9cca2cc50569efc65945325c173cecba83bd Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 11 Mar 2015 14:39:39 +0100 Subject: tee: generic TEE subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial patch for generic TEE subsystem. This subsystem provides: * Registration/un-registration of TEE drivers. * Shared memory between normal world and secure world. * Ioctl interface for interaction with user space. * Sysfs implementation_id of TEE driver A TEE (Trusted Execution Environment) driver is a driver that interfaces with a trusted OS running in some secure environment, for example, TrustZone on ARM cpus, or a separate secure co-processor etc. The TEE subsystem can serve a TEE driver for a Global Platform compliant TEE, but it's not limited to only Global Platform TEEs. This patch builds on other similar implementations trying to solve the same problem: * "optee_linuxdriver" by among others Jean-michel DELORME and Emmanuel MICHEL * "Generic TrustZone Driver" by Javier González Acked-by: Andreas Dannenberg Tested-by: Jerome Forissier (HiKey) Tested-by: Volodymyr Babchuk (RCAR H3) Tested-by: Scott Branden Reviewed-by: Javier González Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 277 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 include/linux/tee_drv.h (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h new file mode 100644 index 000000000000..0f175b8f6456 --- /dev/null +++ b/include/linux/tee_drv.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __TEE_DRV_H +#define __TEE_DRV_H + +#include +#include +#include +#include + +/* + * The file describes the API provided by the generic TEE driver to the + * specific TEE driver. + */ + +#define TEE_SHM_MAPPED 0x1 /* Memory mapped by the kernel */ +#define TEE_SHM_DMA_BUF 0x2 /* Memory with dma-buf handle */ + +struct tee_device; +struct tee_shm; +struct tee_shm_pool; + +/** + * struct tee_context - driver specific context on file pointer data + * @teedev: pointer to this drivers struct tee_device + * @list_shm: List of shared memory object owned by this context + * @data: driver specific context data, managed by the driver + */ +struct tee_context { + struct tee_device *teedev; + struct list_head list_shm; + void *data; +}; + +struct tee_param_memref { + size_t shm_offs; + size_t size; + struct tee_shm *shm; +}; + +struct tee_param_value { + u64 a; + u64 b; + u64 c; +}; + +struct tee_param { + u64 attr; + union { + struct tee_param_memref memref; + struct tee_param_value value; + } u; +}; + +/** + * struct tee_driver_ops - driver operations vtable + * @get_version: returns version of driver + * @open: called when the device file is opened + * @release: release this open file + * @open_session: open a new session + * @close_session: close a session + * @invoke_func: invoke a trusted function + * @cancel_req: request cancel of an ongoing invoke or open + * @supp_revc: called for supplicant to get a command + * @supp_send: called for supplicant to send a response + */ +struct tee_driver_ops { + void (*get_version)(struct tee_device *teedev, + struct tee_ioctl_version_data *vers); + int (*open)(struct tee_context *ctx); + void (*release)(struct tee_context *ctx); + int (*open_session)(struct tee_context *ctx, + struct tee_ioctl_open_session_arg *arg, + struct tee_param *param); + int (*close_session)(struct tee_context *ctx, u32 session); + int (*invoke_func)(struct tee_context *ctx, + struct tee_ioctl_invoke_arg *arg, + struct tee_param *param); + int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); + int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, + struct tee_param *param); + int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params, + struct tee_param *param); +}; + +/** + * struct tee_desc - Describes the TEE driver to the subsystem + * @name: name of driver + * @ops: driver operations vtable + * @owner: module providing the driver + * @flags: Extra properties of driver, defined by TEE_DESC_* below + */ +#define TEE_DESC_PRIVILEGED 0x1 +struct tee_desc { + const char *name; + const struct tee_driver_ops *ops; + struct module *owner; + u32 flags; +}; + +/** + * tee_device_alloc() - Allocate a new struct tee_device instance + * @teedesc: Descriptor for this driver + * @dev: Parent device for this device + * @pool: Shared memory pool, NULL if not used + * @driver_data: Private driver data for this device + * + * Allocates a new struct tee_device instance. The device is + * removed by tee_device_unregister(). + * + * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + */ +struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, + struct device *dev, + struct tee_shm_pool *pool, + void *driver_data); + +/** + * tee_device_register() - Registers a TEE device + * @teedev: Device to register + * + * tee_device_unregister() need to be called to remove the @teedev if + * this function fails. + * + * @returns < 0 on failure + */ +int tee_device_register(struct tee_device *teedev); + +/** + * tee_device_unregister() - Removes a TEE device + * @teedev: Device to unregister + * + * This function should be called to remove the @teedev even if + * tee_device_register() hasn't been called yet. Does nothing if + * @teedev is NULL. + */ +void tee_device_unregister(struct tee_device *teedev); + +/** + * struct tee_shm_pool_mem_info - holds information needed to create a shared + * memory pool + * @vaddr: Virtual address of start of pool + * @paddr: Physical address of start of pool + * @size: Size in bytes of the pool + */ +struct tee_shm_pool_mem_info { + unsigned long vaddr; + phys_addr_t paddr; + size_t size; +}; + +/** + * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved + * memory range + * @priv_info: Information for driver private shared memory pool + * @dmabuf_info: Information for dma-buf shared memory pool + * + * Start and end of pools will must be page aligned. + * + * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied + * in @dmabuf, others will use the range provided by @priv. + * + * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. + */ +struct tee_shm_pool * +tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info, + struct tee_shm_pool_mem_info *dmabuf_info); + +/** + * tee_shm_pool_free() - Free a shared memory pool + * @pool: The shared memory pool to free + * + * The must be no remaining shared memory allocated from this pool when + * this function is called. + */ +void tee_shm_pool_free(struct tee_shm_pool *pool); + +/** + * tee_get_drvdata() - Return driver_data pointer + * @returns the driver_data pointer supplied to tee_register(). + */ +void *tee_get_drvdata(struct tee_device *teedev); + +/** + * tee_shm_alloc() - Allocate shared memory + * @ctx: Context that allocates the shared memory + * @size: Requested size of shared memory + * @flags: Flags setting properties for the requested shared memory. + * + * Memory allocated as global shared memory is automatically freed when the + * TEE file pointer is closed. The @flags field uses the bits defined by + * TEE_SHM_* above. TEE_SHM_MAPPED must currently always be set. If + * TEE_SHM_DMA_BUF global shared memory will be allocated and associated + * with a dma-buf handle, else driver private memory. + * + * @returns a pointer to 'struct tee_shm' + */ +struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags); + +/** + * tee_shm_free() - Free shared memory + * @shm: Handle to shared memory to free + */ +void tee_shm_free(struct tee_shm *shm); + +/** + * tee_shm_put() - Decrease reference count on a shared memory handle + * @shm: Shared memory handle + */ +void tee_shm_put(struct tee_shm *shm); + +/** + * tee_shm_va2pa() - Get physical address of a virtual address + * @shm: Shared memory handle + * @va: Virtual address to tranlsate + * @pa: Returned physical address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa); + +/** + * tee_shm_pa2va() - Get virtual address of a physical address + * @shm: Shared memory handle + * @pa: Physical address to tranlsate + * @va: Returned virtual address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va); + +/** + * tee_shm_get_va() - Get virtual address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @returns virtual address of the shared memory + offs if offs is within + * the bounds of this shared memory, else an ERR_PTR + */ +void *tee_shm_get_va(struct tee_shm *shm, size_t offs); + +/** + * tee_shm_get_pa() - Get physical address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @pa: Physical address to return + * @returns 0 if offs is within the bounds of this shared memory, else an + * error code. + */ +int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa); + +/** + * tee_shm_get_id() - Get id of a shared memory object + * @shm: Shared memory handle + * @returns id + */ +int tee_shm_get_id(struct tee_shm *shm); + +/** + * tee_shm_get_from_id() - Find shared memory object and increase reference + * count + * @ctx: Context owning the shared memory + * @id: Id of shared memory object + * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + */ +struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id); + +#endif /*__TEE_DRV_H*/ -- cgit v1.2.3 From 3843832fc8cadc2d48ba4ea4cd350a696906ac42 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Tue, 28 Feb 2017 17:19:24 +0200 Subject: clk: tegra: Handle UTMIPLL IDDQ Export UTMIPLL IDDQ functions. These will be needed when powergating the XUSB partition. Signed-off-by: BH Hsieh Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- include/linux/clk/tegra.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 7007a5f48080..e17d32831e28 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,5 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_put_utmipll_in_iddq(void); +extern void tegra210_put_utmipll_out_iddq(void); #endif /* __LINUX_CLK_TEGRA_H_ */ -- cgit v1.2.3 From 59af78d78db8bde6a63e09772aa44192f772fa96 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Wed, 15 Mar 2017 17:42:05 +0200 Subject: clk: tegra: Add SATA seq input control This will be used by the powergating driver to ensure proper sequencer state when the SATA domain is powergated. Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- include/linux/clk/tegra.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index e17d32831e28..d23c9cf26993 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,6 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_set_sata_pll_seq_sw(bool state); extern void tegra210_put_utmipll_in_iddq(void); extern void tegra210_put_utmipll_out_iddq(void); -- cgit v1.2.3 From 61012985eb132a2fa5e4a3eddbc33528334fa377 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Mar 2017 16:23:55 +0200 Subject: iommu/vt-d: Use lo_hi_readq() / lo_hi_writeq() There is already helper functions to do 64-bit I/O on 32-bit machines or buses, thus we don't need to reinvent the wheel. Signed-off-by: Andy Shevchenko Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c573a52ae440..485a5b48f038 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -30,6 +30,8 @@ #include #include #include +#include + #include #include @@ -72,24 +74,8 @@ #define OFFSET_STRIDE (9) -#ifdef CONFIG_64BIT #define dmar_readq(a) readq(a) #define dmar_writeq(a,v) writeq(v,a) -#else -static inline u64 dmar_readq(void __iomem *addr) -{ - u32 lo, hi; - lo = readl(addr); - hi = readl(addr + 4); - return (((u64) hi) << 32) + lo; -} - -static inline void dmar_writeq(void __iomem *addr, u64 val) -{ - writel((u32)val, addr); - writel((u32)(val >> 32), addr + 4); -} -#endif #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) -- cgit v1.2.3 From 273df9635385b2156851c7ee49f40658d7bcb29d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 16 Mar 2017 17:00:19 +0000 Subject: iommu/dma: Make PCI window reservation generic Now that we're applying the IOMMU API reserved regions to our IOVA domains, we shouldn't need to privately special-case PCI windows, or indeed anything else which isn't specific to our iommu-dma layer. However, since those aren't IOMMU-specific either, rather than start duplicating code into IOMMU drivers let's transform the existing function into an iommu_get_resv_regions() helper that they can share. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..b6635a46fc7c 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -71,6 +71,7 @@ int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); /* The DMA API isn't _quite_ the whole story, though... */ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg); +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); #else @@ -100,6 +101,10 @@ static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg) { } +static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __KERNEL__ */ #endif /* __DMA_IOMMU_H */ -- cgit v1.2.3 From e8bb4673596ea28fab287dbc417e8100d798cd40 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 27 Mar 2017 07:31:03 +0200 Subject: dmaengine: pl330: remove pdata based initialization This driver is now used only on platforms which support device tree, so it is safe to remove legacy platform data based initialization code. Signed-off-by: Marek Szyprowski Reviewed-by: Ulf Hansson Acked-by: Arnd Bergmann For plat-samsung: Acked-by: Krzysztof Kozlowski Signed-off-by: Vinod Koul --- include/linux/amba/pl330.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 include/linux/amba/pl330.h (limited to 'include/linux') diff --git a/include/linux/amba/pl330.h b/include/linux/amba/pl330.h deleted file mode 100644 index fe93758e8403..000000000000 --- a/include/linux/amba/pl330.h +++ /dev/null @@ -1,35 +0,0 @@ -/* linux/include/linux/amba/pl330.h - * - * Copyright (C) 2010 Samsung Electronics Co. Ltd. - * Jaswinder Singh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __AMBA_PL330_H_ -#define __AMBA_PL330_H_ - -#include - -struct dma_pl330_platdata { - /* - * Number of valid peripherals connected to DMAC. - * This may be different from the value read from - * CR0, as the PL330 implementation might have 'holes' - * in the peri list or the peri could also be reached - * from another DMAC which the platform prefers. - */ - u8 nr_valid_peri; - /* Array of valid peripherals */ - u8 *peri_id; - /* Operational capabilities */ - dma_cap_mask_t cap_mask; - /* Bytes to allocate for MC buffer */ - unsigned mcbuf_sz; -}; - -extern bool pl330_filter(struct dma_chan *chan, void *param); -#endif /* __AMBA_PL330_H_ */ -- cgit v1.2.3 From bf616d21f41174389c6d720ae21bf40f154474c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 Apr 2017 16:54:21 +0100 Subject: Annotate module params that specify hardware parameters (eg. ioport) Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). This will enable such parameters to be locked down in the core parameter parser for secure boot support. I've also included annotations as to what sort of hardware configuration each module is dealing with for future use. Some of these are straightforward (ioport, iomem, irq, dma), but there are also: (1) drivers that switch the semantics of a parameter between ioport and iomem depending on a second parameter, (2) drivers that appear to reserve a CPU memory buffer at a fixed address, (3) other parameters, such as bus types and irq selection bitmasks. For the moment, the hardware configuration type isn't actually stored, though its validity is checked. Signed-off-by: David Howells --- include/linux/moduleparam.h | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 52666d90ca94..6be1949ebcdf 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -60,9 +60,11 @@ struct kernel_param_ops { * Flags available for kernel_param * * UNSAFE - the parameter is dangerous and setting it will taint the kernel + * HWPARAM - Hardware param not permitted in lockdown mode */ enum { - KERNEL_PARAM_FL_UNSAFE = (1 << 0) + KERNEL_PARAM_FL_UNSAFE = (1 << 0), + KERNEL_PARAM_FL_HWPARAM = (1 << 1), }; struct kernel_param { @@ -451,6 +453,67 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) +enum hwparam_type { + hwparam_ioport, /* Module parameter configures an I/O port */ + hwparam_iomem, /* Module parameter configures an I/O mem address */ + hwparam_ioport_or_iomem, /* Module parameter could be either, depending on other option */ + hwparam_irq, /* Module parameter configures an I/O port */ + hwparam_dma, /* Module parameter configures a DMA channel */ + hwparam_dma_addr, /* Module parameter configures a DMA buffer address */ + hwparam_other, /* Module parameter configures some other value */ +}; + +/** + * module_param_hw_named - A parameter representing a hw parameters + * @name: a valid C identifier which is the parameter name. + * @value: the actual lvalue to alter. + * @type: the type of the parameter + * @hwtype: what the value represents (enum hwparam_type) + * @perm: visibility in sysfs. + * + * Usually it's a good idea to have variable names and user-exposed names the + * same, but that's harder if the variable must be non-static or is inside a + * structure. This allows exposure under a different name. + */ +#define module_param_hw_named(name, value, type, hwtype, perm) \ + param_check_##type(name, &(value)); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_ops_##type, &value, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, #type) + +#define module_param_hw(name, type, hwtype, perm) \ + module_param_hw_named(name, name, type, hwtype, perm) + +/** + * module_param_hw_array - A parameter representing an array of hw parameters + * @name: the name of the array variable + * @type: the type, as per module_param() + * @hwtype: what the value represents (enum hwparam_type) + * @nump: optional pointer filled in with the number written + * @perm: visibility in sysfs + * + * Input and output are as comma-separated values. Commas inside values + * don't work properly (eg. an array of charp). + * + * ARRAY_SIZE(@name) is used to determine the number of elements in the + * array, so the definition must be visible. + */ +#define module_param_hw_array(name, type, hwtype, nump, perm) \ + param_check_##type(name, &(name)[0]); \ + static const struct kparam_array __param_arr_##name \ + = { .max = ARRAY_SIZE(name), .num = nump, \ + .ops = ¶m_ops_##type, \ + .elemsize = sizeof(name[0]), .elem = name }; \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_array_ops, \ + .arr = &__param_arr_##name, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, "array of " #type) + + extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -- cgit v1.2.3 From adf5e5168bd51c42332ebaa709351fa6ed65ea73 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 27 Jan 2017 12:22:54 +0000 Subject: iommu: Better document the IOMMU_PRIV flag This is a fairly subtle thing - let's make sure it's described as clearly as possible to avoid potential misunderstandings. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- include/linux/iommu.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2e4de0deee53..88ec8c6580d3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -32,10 +32,13 @@ #define IOMMU_NOEXEC (1 << 3) #define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */ /* - * This is to make the IOMMU API setup privileged - * mapppings accessible by the master only at higher - * privileged execution level and inaccessible at - * less privileged levels. + * Where the bus hardware includes a privilege level as part of its access type + * markings, and certain devices are capable of issuing transactions marked as + * either 'supervisor' or 'user', the IOMMU_PRIV flag requests that the other + * given permission flags only apply to accesses at the higher privilege level, + * and that unprivileged transactions should have as little access as possible. + * This would usually imply the same permissions as kernel mappings on the CPU, + * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) -- cgit v1.2.3 From b8c17e6664c461e4aed545a943304c3b32dd309c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Nov 2016 14:25:21 -0800 Subject: rcu: Maintain special bits at bottom of ->dynticks counter Currently, IPIs are used to force other CPUs to invalidate their TLBs in response to a kernel virtual-memory mapping change. This works, but degrades both battery lifetime (for idle CPUs) and real-time response (for nohz_full CPUs), and in addition results in unnecessary IPIs due to the fact that CPUs executing in usermode are unaffected by stale kernel mappings. It would be better to cause a CPU executing in usermode to wait until it is entering kernel mode to do the flush, first to avoid interrupting usemode tasks and second to handle multiple flush requests with a single flush in the case of a long-running user task. This commit therefore reserves a bit at the bottom of the ->dynticks counter, which is checked upon exit from extended quiescent states. If it is set, it is cleared and then a new rcu_eqs_special_exit() macro is invoked, which, if not supplied, is an empty single-pass do-while loop. If this bottom bit is set on -entry- to an extended quiescent state, then a WARN_ON_ONCE() triggers. This bottom bit may be set using a new rcu_eqs_special_set() function, which returns true if the bit was set, or false if the CPU turned out to not be in an extended quiescent state. Please note that this function refuses to set the bit for a non-nohz_full CPU when that CPU is executing in usermode because usermode execution is tracked by RCU as a dyntick-idle extended quiescent state only for nohz_full CPUs. Reported-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..6c9d941e3962 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! */ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; -- cgit v1.2.3 From 77e5849688670280b173bb9e0544e9da7b2acc36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Jan 2017 13:32:50 -0800 Subject: rcu: Make arch select smp_mb__after_unlock_lock() strength The definition of smp_mb__after_unlock_lock() is currently smp_mb() for CONFIG_PPC and a no-op otherwise. It would be better to instead provide an architecture-selectable Kconfig option, and select the strength of smp_mb__after_unlock_lock() based on that option. This commit therefore creates ARCH_WEAK_RELEASE_ACQUIRE, has PPC select it, and bases the definition of smp_mb__after_unlock_lock() on this new ARCH_WEAK_RELEASE_ACQUIRE Kconfig option. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: Boqun Feng Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman Cc: Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..e6146d0074f8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1127,11 +1127,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3 From 900b1028ec388e50c98200641ae4274794c807cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Feb 2017 14:32:54 -0800 Subject: srcu: Allow SRCU to access rcu_scheduler_active This is primarily a code-movement commit in preparation for allowing SRCU to handle early-boot SRCU grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6c9d941e3962..5219be250f00 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -217,14 +217,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) -- cgit v1.2.3 From c2a8ec0778b2ca0d360ba9b5cac7fcd5ddfe798f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Mar 2017 15:31:55 -0800 Subject: srcu: Move to state-based grace-period sequencing The current SRCU grace-period processing might never reach the last portion of srcu_advance_batches(). This is OK given the current implementation, as the first portion, up to the try_check_zero() following the srcu_flip() is sufficient to drive grace periods forward. However, it has the unfortunate side-effect of making it impossible to determine when a given grace period has ended, and it will be necessary to efficiently trace ends of grace periods in order to efficiently handle per-CPU SRCU callback lists. This commit therefore adds states to the SRCU grace-period processing, so that the end of a given SRCU grace period is marked by the transition to the SRCU_STATE_DONE state. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..f149a685896c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -48,7 +48,7 @@ struct srcu_struct { unsigned long completed; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; + int srcu_state; /* callbacks just queued */ struct rcu_batch batch_queue; /* callbacks try to do the first check_zero */ @@ -62,6 +62,12 @@ struct srcu_struct { #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 +#define SRCU_STATE_DONE 3 + #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *sp, const char *name, @@ -89,7 +95,7 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ + .srcu_state = SRCU_STATE_IDLE, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ -- cgit v1.2.3 From ac367c1c621b75689f6d5cd8301d364ba2c9f292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Mar 2017 07:14:06 -0800 Subject: srcu: Add grace-period sequence numbers This commit adds grace-period sequence numbers, which will be used to handle mid-boot grace periods and per-CPU callback lists. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f149a685896c..047ac8c28a4e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -46,6 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; + unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ int srcu_state; -- cgit v1.2.3 From 8660b7d8a545227fd9ee80508aa82528ea9947d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Mar 2017 16:48:18 -0700 Subject: srcu: Use rcu_segcblist to track SRCU callbacks This commit switches SRCU from custom-built callback queues to the new rcu_segcblist structure. This change associates grace-period sequence numbers with groups of callbacks, which will be needed for efficient processing of per-CPU callbacks. Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 678 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcu.h | 24 +- 2 files changed, 683 insertions(+), 19 deletions(-) create mode 100644 include/linux/rcu_segcblist.h (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..74b1e7243955 --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,678 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#ifndef __KERNEL_RCU_SEGCBLIST_H +#define __KERNEL_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Initialize simple callback list. */ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. + * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. + * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) + */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number at which new callbacks would become + * ready to invoke. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 047ac8c28a4e..ad154a7bc114 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,31 +32,20 @@ #include #include #include +#include struct srcu_array { unsigned long lock_count[2]; unsigned long unlock_count[2]; }; -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ int srcu_state; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; + struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -97,10 +86,7 @@ void process_srcu(struct work_struct *work); .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .srcu_state = SRCU_STATE_IDLE, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ } -- cgit v1.2.3 From f2425b4efb0c69e77c0b9666b605ae4a1ecaae47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 12:42:30 -0700 Subject: srcu: Move combining-tree definitions for SRCU's benefit This commit moves the C preprocessor code that defines the default shape of the rcu_node combining tree to a new include/linux/rcu_node_tree.h file as a first step towards enabling SRCU to create its own combining tree, which in turn enables SRCU to implement per-CPU callback handling, thus avoiding contention on the lock currently guarding the single list of callbacks. Note that users of SRCU still need to know the size of the srcu_struct structure, hence include/linux rather than kernel/rcu. This commit is code-movement only. Signed-off-by: Paul E. McKenney --- include/linux/rcu_node_tree.h | 102 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 include/linux/rcu_node_tree.h (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..b7eb97096b1c --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,102 @@ +/* + * RCU node combining tree definitions. These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. + */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +extern int rcu_num_lvls; +extern int rcu_num_nodes; + +#endif /* __LINUX_RCU_NODE_TREE_H */ -- cgit v1.2.3 From 2b34c43cc1671c59bad6dd1682ae3ee4f0919eb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 14:29:53 -0700 Subject: srcu: Move rcu_init_levelspread() to rcu_tree_node.h This commit moves the rcu_init_levelspread() function from kernel/rcu/tree.c to kernel/rcu/rcu.h so that SRCU can access it. This is another step towards enabling SRCU to create its own combining tree. This commit is code-movement only, give or take knock-on adjustments. Signed-off-by: Paul E. McKenney --- include/linux/rcu_node_tree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index b7eb97096b1c..4b766b61e1a0 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -96,7 +96,4 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -extern int rcu_num_lvls; -extern int rcu_num_nodes; - #endif /* __LINUX_RCU_NODE_TREE_H */ -- cgit v1.2.3 From 80a7956fe36c2ee40c6ff12c77926d267802b7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Mar 2017 15:26:18 -0700 Subject: srcu: Merge ->srcu_state into ->srcu_gp_seq Updating ->srcu_state and ->srcu_gp_seq will lead to extremely complex race conditions given multiple callback queues, so this commit takes advantage of the two-bit state now available in rcu_seq counters to store the state in the bottom two bits of ->srcu_gp_seq. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ad154a7bc114..e7dbc01b61a1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,8 +43,7 @@ struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ - int srcu_state; + spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,7 +55,6 @@ struct srcu_struct { #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define SRCU_STATE_DONE 3 #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -85,7 +83,6 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_state = SRCU_STATE_IDLE, \ .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ -- cgit v1.2.3 From f60d231a87c5c9f23f10e69996f396d46f5bf901 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Mar 2017 13:46:33 -0700 Subject: srcu: Crude control of expedited grace periods SRCU's implementation of expedited grace periods has always assumed that the SRCU instance is idle when the expedited request arrives. This commit improves this a bit by maintaining a count of the number of outstanding expedited requests, thus allowing prior non-expedited grace periods accommodate these requests by shifting to expedited mode. However, any non-expedited wait already in progress will still wait for the full duration. Improved control of expedited grace periods is planned, but one step at a time. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e7dbc01b61a1..73a1b6296224 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -42,6 +42,7 @@ struct srcu_array { struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; -- cgit v1.2.3 From d8be81735aa89413b333de488251f0e64e2be591 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 09:59:38 -0700 Subject: srcu: Create a tiny SRCU In response to automated complaints about modifications to SRCU increasing its size, this commit creates a tiny SRCU that is used in SMP=n && PREEMPT=n builds. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 69 +++++------------------------------- include/linux/srcutiny.h | 81 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcutree.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 61 deletions(-) create mode 100644 include/linux/srcutiny.h create mode 100644 include/linux/srcutree.h (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 73a1b6296224..907f09b14eda 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -34,28 +34,7 @@ #include #include -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -/* Values for -> state variable. */ -#define SRCU_STATE_IDLE 0 -#define SRCU_STATE_SCAN1 1 -#define SRCU_STATE_SCAN2 2 +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -77,42 +56,13 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include +#elif defined(CONFIG_TREE_SRCU) +#include +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -138,9 +88,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,81 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data. + */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..f2b3bd6c6bc2 --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,91 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct srcu_struct { + unsigned long completed; + unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->srcu_cblist */ + struct rcu_segcblist srcu_cblist; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif -- cgit v1.2.3 From dad81a2026841b5e2651aab58a7398c13cc05847 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 17:23:44 -0700 Subject: srcu: Introduce CLASSIC_SRCU Kconfig option The TREE_SRCU rewrite is large and a bit on the non-simple side, so this commit helps reduce risk by allowing the old v4.11 SRCU algorithm to be selected using a new CLASSIC_SRCU Kconfig option that depends on RCU_EXPERT. The default is to use the new TREE_SRCU and TINY_SRCU algorithms, in order to help get these the testing that they need. However, if your users do not require the update-side scalability that is to be provided by TREE_SRCU, select RCU_EXPERT and then CLASSIC_SRCU to revert back to the old classic SRCU algorithm. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 + include/linux/srcuclassic.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 include/linux/srcuclassic.h (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 907f09b14eda..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,8 @@ int init_srcu_struct(struct srcu_struct *sp); #include #elif defined(CONFIG_TREE_SRCU) #include +#elif defined(CONFIG_CLASSIC_SRCU) +#include #else #error "Unknown SRCU implementation specified to kernel configuration" #endif diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,101 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif -- cgit v1.2.3 From 5f0d5a3ae7cff0d7fa943c199c3a2e44f23e1fac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jan 2017 02:53:44 -0800 Subject: mm: Rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU A group of Linux kernel hackers reported chasing a bug that resulted from their assumption that SLAB_DESTROY_BY_RCU provided an existence guarantee, that is, that no block from such a slab would be reallocated during an RCU read-side critical section. Of course, that is not the case. Instead, SLAB_DESTROY_BY_RCU only prevents freeing of an entire slab of blocks. However, there is a phrase for this, namely "type safety". This commit therefore renames SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU in order to avoid future instances of this sort of confusion. Signed-off-by: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Acked-by: Johannes Weiner Acked-by: Vlastimil Babka [ paulmck: Add comments mentioning the old name, as requested by Eric Dumazet, in order to help people familiar with the old name find the new one. ] Acked-by: David Rientjes --- include/linux/dma-fence.h | 4 ++-- include/linux/slab.h | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. * This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. * * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ -- cgit v1.2.3 From 468d01bec544286bb5283f012b95b5b84636565b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2017 11:40:15 -0800 Subject: types: Update obsolete callback_head comment The comment header for callback_head (and thus for rcu_head) states that the bottom two bits of a pointer to these structures must be zero. This is obsolete: The new requirement is that only the bottom bit need be zero. This commit therefore updates this comment. Signed-off-by: Paul E. McKenney --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * -- cgit v1.2.3 From 48ac34666ff76843d8743db1cc78b303759916f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 27 Feb 2017 21:14:19 +0200 Subject: hlist_add_tail_rcu disable sparse warning sparse is unhappy about this code in hlist_add_tail_rcu: struct hlist_node *i, *last = NULL; for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) last = i; This is because hlist_next_rcu and hlist_next_rcu return __rcu pointers. It's a false positive - it's a write side primitive and so does not need to be called in a read side critical section. The following trivial patch disables the warning without changing the behaviour in any way. Note: __hlist_for_each_rcu would also remove the warning but it would be confusing since it calls rcu_derefence and is designed to run in the rcu read side critical section. Signed-off-by: Michael S. Tsirkin Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; i; i = i->next) last = i; if (last) { -- cgit v1.2.3 From 4a67c9fde04fc1b6752fa68c495310ca3ed29eeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 31 Mar 2017 11:11:48 +0200 Subject: mtd: use dev_of_node helper in mtd_get_of_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows better compile-time optimizations with CONFIG_OF disabled. Signed-off-by: Rafał Miłecki Acked-by: Boris Brezillon Signed-off-by: Brian Norris --- include/linux/mtd/mtd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af..f8db5b2e4028 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -393,7 +393,7 @@ static inline void mtd_set_of_node(struct mtd_info *mtd, static inline struct device_node *mtd_get_of_node(struct mtd_info *mtd) { - return mtd->dev.of_node; + return dev_of_node(&mtd->dev); } static inline int mtd_oobavail(struct mtd_info *mtd, struct mtd_oob_ops *ops) -- cgit v1.2.3 From 3f1866779cf8338e1c8bd32e5f6f5424795ef191 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:50:58 +0530 Subject: of: dma: Make of_dma_deconfigure() public As part of moving DMA initializing to probe time the of_dma_deconfigure() function will need to be called from different source files. Make it public and move it to drivers/of/device.c where the of_dma_configure() function is. Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pinchart Signed-off-by: Joerg Roedel --- include/linux/of_device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index c12dace043f3..af984551cc2b 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -56,6 +56,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } void of_dma_configure(struct device *dev, struct device_node *np); +void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -105,6 +106,8 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } static inline void of_dma_configure(struct device *dev, struct device_node *np) {} +static inline void of_dma_deconfigure(struct device *dev) +{} #endif /* CONFIG_OF */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3 From 09515ef5ddad71c7820e5e428da418b709feeb26 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:01 +0530 Subject: of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices Configuring DMA ops at probe time will allow deferring device probe when the IOMMU isn't available yet. The dma_configure for the device is now called from the generic device_attach callback just before the bus/driver probe is called. This way, configuring the DMA ops for the device would be called at the same place for all bus_types, hence the deferred probing mechanism should work for all buses as well. pci_bus_add_devices (platform/amba)(_device_create/driver_register) | | pci_bus_add_device (device_add/driver_register) | | device_attach device_initial_probe | | __device_attach_driver __device_attach_driver | driver_probe_device | really_probe | dma_configure Similarly on the device/driver_unregister path __device_release_driver is called which inturn calls dma_deconfigure. This patch changes the dma ops configuration to probe time for both OF and ACPI based platform/amba/pci bus devices. Tested-by: Marek Szyprowski Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Acked-by: Rob Herring Acked-by: Bjorn Helgaas (drivers/pci part) Acked-by: Rafael J. Wysocki Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- include/linux/dma-mapping.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0977317c6835..4f3eecedca2d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -728,6 +728,18 @@ dma_mark_declared_memory_occupied(struct device *dev, } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ +#ifdef CONFIG_HAS_DMA +int dma_configure(struct device *dev); +void dma_deconfigure(struct device *dev); +#else +static inline int dma_configure(struct device *dev) +{ + return 0; +} + +static inline void dma_deconfigure(struct device *dev) {} +#endif + /* * Managed DMA API */ -- cgit v1.2.3 From 7b07cbefb68d486febf47e13b570fed53d9296b4 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:51:02 +0530 Subject: iommu: of: Handle IOMMU lookup failure with deferred probing or error Failures to look up an IOMMU when parsing the DT iommus property need to be handled separately from the .of_xlate() failures to support deferred probing. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the device tree describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pichart Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- include/linux/of_device.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index af984551cc2b..2cacdd81062e 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -55,7 +55,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -void of_dma_configure(struct device *dev, struct device_node *np); +int of_dma_configure(struct device *dev, struct device_node *np); void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ @@ -104,8 +104,11 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) { return NULL; } -static inline void of_dma_configure(struct device *dev, struct device_node *np) -{} + +static inline int of_dma_configure(struct device *dev, struct device_node *np) +{ + return 0; +} static inline void of_dma_deconfigure(struct device *dev) {} #endif /* CONFIG_OF */ -- cgit v1.2.3 From 5a1bb638d5677053c7addcb228b56da6fccb5d68 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:03 +0530 Subject: drivers: acpi: Handle IOMMU lookup failure with deferred probing or error This is an equivalent to the DT's handling of the iommu master's probe with deferred probing when the corrsponding iommu is not probed yet. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the firmware describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. Tested-by: Hanjun Guo Reviewed-by: Robin Murphy [Lorenzo: Added fixes for dma_coherent_mask overflow, acpi_dma_configure called multiple times for same device] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- include/linux/acpi.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..79d06ef654c9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -762,8 +762,11 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline void acpi_dma_configure(struct device *dev, - enum dev_dma_attr attr) { } +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return 0; +} static inline void acpi_dma_deconfigure(struct device *dev) { } -- cgit v1.2.3 From 316ca8804ea84a782d5ba2163711ebb22116ff5a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 10 Apr 2017 16:51:06 +0530 Subject: ACPI/IORT: Remove linker section for IORT entries probing The IORT linker section introduced by commit 34ceea275f62 ("ACPI/IORT: Introduce linker section for IORT entries probing") was needed to make sure SMMU drivers are registered (and therefore probed) in the kernel before devices using the SMMU have a chance to probe in turn. Through the introduction of deferred IOMMU configuration the linker section based IORT probing infrastructure is not needed any longer, in that device/SMMU probe dependencies are managed through the probe deferral mechanism, making the IORT linker section infrastructure unused, so that it can be removed. Remove the unused IORT linker section probing infrastructure from the kernel to complete the ACPI IORT IOMMU configure probe deferral mechanism implementation. Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Signed-off-by: Lorenzo Pieralisi Cc: Sricharan R Signed-off-by: Joerg Roedel --- include/linux/acpi_iort.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 77e08099e554..f167e1d045ff 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -52,7 +52,4 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) { return NULL; } #endif -#define IORT_ACPI_DECLARE(name, table_id, fn) \ - ACPI_DECLARE_PROBE_ENTRY(iort, name, table_id, 0, NULL, 0, fn) - #endif /* __ACPI_IORT_H__ */ -- cgit v1.2.3 From 49a57ef7f8492ef985ee1ecdb927ca78a6b2f308 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 12 Apr 2017 00:21:27 -0500 Subject: iommu/omap: Drop legacy-style device support All the supported boards that have OMAP IOMMU devices do support DT boot only now. So, drop the support for the non-DT legacy-style devices from the OMAP IOMMU driver. Couple of the fields from the iommu platform data would no longer be required, so they have also been cleaned up. The IOMMU platform data is still needed though for performing reset management properly in a multi-arch environment. Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- include/linux/platform_data/iommu-omap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index 0496d171700a..a40fc0f4f9de 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -30,10 +30,7 @@ struct omap_iommu_arch_data { }; struct iommu_platform_data { - const char *name; const char *reset_name; - int nr_tlb_entries; - int (*assert_reset)(struct platform_device *pdev, const char *name); int (*deassert_reset)(struct platform_device *pdev, const char *name); }; -- cgit v1.2.3 From e73b7afe4e8ca5ec4304a9e1d5009755a85fff91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 12 Apr 2017 00:21:28 -0500 Subject: iommu/omap: Move data structures to omap-iommu.h The internal data-structures are scattered over various header and C files. Consolidate them in omap-iommu.h. While at this, add the kerneldoc comment for the missing iommu domain variable and revise the iommu_arch_data name. Signed-off-by: Joerg Roedel [s-anna@ti.com: revise kerneldoc comments] Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- include/linux/platform_data/iommu-omap.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index a40fc0f4f9de..e8b12dbf6170 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -12,23 +12,6 @@ #include -#define MMU_REG_SIZE 256 - -/** - * struct iommu_arch_data - omap iommu private data - * @name: name of the iommu device - * @iommu_dev: handle of the iommu device - * - * This is an omap iommu private data object, which binds an iommu user - * to its iommu device. This object should be placed at the iommu user's - * dev_archdata so generic IOMMU API can be used without having to - * utilize omap-specific plumbing anymore. - */ -struct omap_iommu_arch_data { - const char *name; - struct omap_iommu *iommu_dev; -}; - struct iommu_platform_data { const char *reset_name; int (*assert_reset)(struct platform_device *pdev, const char *name); -- cgit v1.2.3 From 518662e0fcb9fa241fe90a337b59bc5066b2a930 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2017 12:22:09 +1000 Subject: NFS: fix usage of mempools. When passed GFP flags that allow sleeping (such as GFP_NOIO), mempool_alloc() will never return NULL, it will wait until memory is available. This means that we don't need to handle failure, but that we do need to ensure one thread doesn't call mempool_alloc() twice on the one pool without queuing or freeing the first allocation. If multiple threads did this during times of high memory pressure, the pool could be exhausted and a deadlock could result. pnfs_generic_alloc_ds_commits() attempts to allocate from the nfs_commit_mempool while already holding an allocation from that pool. This is not safe. So change nfs_commitdata_alloc() to take a flag that indicates whether failure is acceptable. In pnfs_generic_alloc_ds_commits(), accept failure and handle it as we currently do. Else where, do not accept failure, and do not handle it. Even when failure is acceptable, we want to succeed if possible. That means both - using an entry from the pool if there is one - waiting for direct reclaim is there isn't. We call mempool_alloc(GFP_NOWAIT) to achieve the first, then kmem_cache_alloc(GFP_NOIO|__GFP_NORETRY) to achieve the second. Each of these can fail, but together they do the best they can without blocking indefinitely. The objects returned by kmem_cache_alloc() will still be freed by mempool_free(). This is safe as mempool_alloc() uses exactly the same function to allocate objects (since the mempool was created with mempool_create_slab_pool()). The object returned by mempool_alloc() and kmem_cache_alloc() are indistinguishable so mempool_free() will handle both identically, either adding to the pool or calling kmem_cache_free(). Also, don't test for failure when allocating from nfs_wdata_mempool. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 287f34161086..1b29915247b2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -502,7 +502,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(void); +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); static inline int -- cgit v1.2.3 From fbe77c30e9abcb3429380dec622439991a718e31 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 19 Apr 2017 10:11:35 -0400 Subject: NFS: move rw_mode to nfs_pageio_header Let's try to have it in a cacheline in nfs4_proc_pgio_rpc_prepare(). Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/nfs_page.h | 4 ++-- include/linux/nfs_xdr.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 957049f72290..6f01e28bba27 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -64,7 +64,6 @@ struct nfs_pageio_ops { }; struct nfs_rw_ops { - const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, @@ -124,7 +123,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how); + int how, + gfp_t gfp_flags); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 348f7c158084..51e27f9746ee 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1427,6 +1427,7 @@ struct nfs_pgio_header { struct list_head pages; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ + fmode_t rw_mode; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; -- cgit v1.2.3 From 6ade8694f471d847500c7cec152cc15171cef5d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Apr 2017 17:30:06 -0700 Subject: kvm: Move srcu_struct fields to end of struct kvm Parallelizing SRCU callback handling will increase the size of srcu_struct, which will move the kvm structure's kvm_arch field out of reach of powerpc's current assembly code, which will result in the following sort of build error: arch/powerpc/kvm/book3s_hv_rmhandlers.S:617: Error: operand out of range (0x000000000000b328 is not between 0xffffffffffff8000 and 0x0000000000007fff) This commit moves the srcu_struct fields in the kvm structure to follow the kvm_arch field, which will allow powerpc's assembly code to continue to be able to reach the kvm_arch field. Reported-by: Stephen Rothwell Reported-by: Michael Ellerman Reported-by: kbuild test robot Suggested-by: Paolo Bonzini Signed-off-by: Paul E. McKenney Tested-by: Michael Ellerman Acked-by: Paolo Bonzini [ paulmck: Moved this commit to precede SRCU callback parallelization, and reworded the commit log into future tense, all in the name of bisectability. ] --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..96c8e29c6442 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -375,8 +375,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -429,6 +427,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ -- cgit v1.2.3 From da915ad5cf25b5f5d358dd3670c3378d8ae8c03e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Apr 2017 09:01:53 -0700 Subject: srcu: Parallelize callback handling Peter Zijlstra proposed using SRCU to reduce mmap_sem contention [1,2], however, there are workloads that could result in a high volume of concurrent invocations of call_srcu(), which with current SRCU would result in excessive lock contention on the srcu_struct structure's ->queue_lock, which protects SRCU's callback lists. This commit therefore moves SRCU to per-CPU callback lists, thus greatly reducing contention. Because a given SRCU instance no longer has a single centralized callback list, starting grace periods and invoking callbacks are both more complex than in the single-list Classic SRCU implementation. Starting grace periods and handling callbacks are now handled using an srcu_node tree that is in some ways similar to the rcu_node trees used by RCU-bh, RCU-preempt, and RCU-sched (for example, the srcu_node tree shape is controlled by exactly the same Kconfig options and boot parameters that control the shape of the rcu_node tree). In addition, the old per-CPU srcu_array structure is now named srcu_data and contains an rcu_segcblist structure named ->srcu_cblist for its callbacks (and a spinlock to protect this). The srcu_struct gets an srcu_gp_seq that is used to associate callback segments with the corresponding completion-time grace-period number. These completion-time grace-period numbers are propagated up the srcu_node tree so that the grace-period workqueue handler can determine whether additional grace periods are needed on the one hand and where to look for callbacks that are ready to be invoked. The srcu_barrier() function must now wait on all instances of the per-CPU ->srcu_cblist. Because each ->srcu_cblist is protected by ->lock, srcu_barrier() can remotely add the needed callbacks. In theory, it could also remotely start grace periods, but in practice doing so is complex and racy. And interestingly enough, it is never necessary for srcu_barrier() to start a grace period because srcu_barrier() only enqueues a callback when a callback is already present--and it turns out that a grace period has to have already been started for this pre-existing callback. Furthermore, it is only the callback that srcu_barrier() needs to wait on, not any particular grace period. Therefore, a new rcu_segcblist_entrain() function enqueues the srcu_barrier() function's callback into the same segment occupied by the last pre-existing callback in the list. The special case where all the pre-existing callbacks are on a different list (because they are in the process of being invoked) is handled by enqueuing srcu_barrier()'s callback into the RCU_DONE_TAIL segment, relying on the done-callbacks check that takes place after all callbacks are inovked. Note that the readers use the same algorithm as before. Note that there is a separate srcu_idx that tells the readers what counter to increment. This unfortunately cannot be combined with srcu_gp_seq because they need to be incremented at different times. This commit introduces some ugly #ifdefs in rcutorture. These will go away when I feel good enough about Tree SRCU to ditch Classic SRCU. Some crude performance comparisons, courtesy of a quickly hacked rcuperf asynchronous-grace-period capability: Callback Queuing Overhead ------------------------- # CPUS Classic SRCU Tree SRCU ------ ------------ --------- 2 0.349 us 0.342 us 16 31.66 us 0.4 us 41 --------- 0.417 us The times are the 90th percentiles, a statistic that was chosen to reject the overheads of the occasional srcu_barrier() call needed to avoid OOMing the test machine. The rcuperf test hangs when running Classic SRCU at 41 CPUs, hence the line of dashes. Despite the hacks to both the rcuperf code and that statistics, this is a convincing demonstration of Tree SRCU's performance and scalability advantages. [1] https://lwn.net/Articles/309030/ [2] https://patchwork.kernel.org/patch/5108281/ Signed-off-by: Paul E. McKenney [ paulmck: Fix initialization if synchronize_srcu_expedited() called first. ] --- include/linux/rcu_segcblist.h | 42 ++++++++++++++++++++--- include/linux/srcutree.h | 80 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 102 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 74b1e7243955..ced8f313fd05 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -401,6 +401,37 @@ static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; } +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + /* * Extract only the counts from the specified rcu_segcblist structure, * and place them in the specified rcu_cblist structure. This function @@ -537,7 +568,8 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; /* * Find all callbacks whose ->gp_seq numbers indicate that they @@ -582,8 +614,9 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, * them to complete at the end of the earlier grace period. * * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. */ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) @@ -591,7 +624,8 @@ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, int i; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; /* * Find the segment preceding the oldest segment of callbacks diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f2b3bd6c6bc2..0400e211aa44 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -24,25 +24,75 @@ #ifndef _LINUX_SRCU_TREE_H #define _LINUX_SRCU_TREE_H -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; +#include +#include + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. + */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + int cpu; + struct srcu_struct *sp; }; +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. */ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. */ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; -/* Values for -> state variable. */ +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 @@ -51,11 +101,9 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } @@ -79,7 +127,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) -- cgit v1.2.3 From bcbfdd01dce5556a952fae84ef16fd0f12525e7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Apr 2017 15:50:41 -0700 Subject: rcu: Make non-preemptive schedule be Tasks RCU quiescent state Currently, a call to schedule() acts as a Tasks RCU quiescent state only if a context switch actually takes place. However, just the call to schedule() guarantees that the calling task has moved off of whatever tracing trampoline that it might have been one previously. This commit therefore plumbs schedule()'s "preempt" parameter into rcu_note_context_switch(), which then records the Tasks RCU quiescent state, but only if this call to schedule() was -not- due to a preemption. To avoid adding overhead to the common-case context-switch path, this commit hides the rcu_note_context_switch() check under an existing non-common-case check. Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 11 ++++++++--- include/linux/rcutiny.h | 13 +++++++++---- include/linux/rcutree.h | 5 +++-- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e6146d0074f8..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5219be250f00..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -242,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); -- cgit v1.2.3 From 50f2112cf7a3e62a8d33838eb205d5fef306457a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:09 -0400 Subject: locks: Set FL_CLOSE when removing flock locks on close() Set FL_CLOSE in fl_flags as in locks_remove_posix() when clearing locks. NFS will check for this flag to ensure an unlock is sent in a following patch. Fuse handles flock and posix locks differently for FL_CLOSE, and so requires a fixup to retain the existing behavior for flock. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Acked-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..72061aa65405 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -909,6 +909,8 @@ static inline struct file *get_file(struct file *f) #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. -- cgit v1.2.3 From 7d6ddf88c4db372689c8aa65ea652d0514d66c06 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:10 -0400 Subject: NFS: Add an iocounter wait function for async RPC tasks By sleeping on a new NFS Unlock-On-Close waitqueue, rpc tasks may wait for a lock context's iocounter to reach zero. The rpc waitqueue is only woken when the open_context has the NFS_CONTEXT_UNLOCK flag set in order to mitigate spurious wake-ups for any iocounter reaching zero. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_page.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1b29915247b2..9aa044e76820 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,6 +76,7 @@ struct nfs_open_context { #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) +#define NFS_CONTEXT_UNLOCK (3) int error; struct list_head list; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c67848..2a70f34dffe8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -222,6 +222,7 @@ struct nfs_server { u32 mountd_version; unsigned short mountd_port; unsigned short mountd_protocol; + struct rpc_wait_queue uoc_rpcwaitq; }; /* Server capabilities */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6f01e28bba27..247cc3d3498f 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -141,6 +141,7 @@ extern int nfs_page_group_lock(struct nfs_page *, bool); extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* * Lock the page of an asynchronous request -- cgit v1.2.3 From b1ece737f44f91dca8f4829cf0b442e752e406db Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:11 -0400 Subject: lockd: Introduce nlmclnt_operations NFS would enjoy the ability to modify the behavior of the NLM client's unlock RPC task in order to delay the transmission of the unlock until IO that was submitted under that lock has completed. This ability can ensure that the NLM client will always complete the transmission of an unlock even if the waiting caller has been interrupted with fatal signal. For this purpose, a pointer to a struct nlmclnt_operations can be assigned in a nfs_module's nfs_rpc_ops that will install those nlmclnt_operations on the nlm_host. The struct nlmclnt_operations defines three callback operations that will be used in a following patch: nlmclnt_alloc_call - used to call back after a successful allocation of a struct nlm_rqst in nlmclnt_proc(). nlmclnt_unlock_prepare - used to call back during NLM unlock's rpc_call_prepare. The NLM client defers calling rpc_call_start() until this callback returns false. nlmclnt_release_call - used to call back when the NLM client's struct nlm_rqst is freed. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/lockd/bind.h | 24 ++++++++++++++++++++++-- include/linux/lockd/lockd.h | 2 ++ include/linux/nfs_xdr.h | 1 + 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 140edab64446..05728396a1a1 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -18,6 +18,7 @@ /* Dummy declarations */ struct svc_rqst; +struct rpc_task; /* * This is the set of functions for lockd->nfsd communication @@ -43,6 +44,7 @@ struct nlmclnt_initdata { u32 nfs_version; int noresvport; struct net *net; + const struct nlmclnt_operations *nlmclnt_ops; }; /* @@ -52,8 +54,26 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); -extern int nlmclnt_proc(struct nlm_host *host, int cmd, - struct file_lock *fl); +/* + * NLM client operations provide a means to modify RPC processing of NLM + * requests. Callbacks receive a pointer to data passed into the call to + * nlmclnt_proc(). + */ +struct nlmclnt_operations { + /* Called on successful allocation of nlm_rqst, use for allocation or + * reference counting. */ + void (*nlmclnt_alloc_call)(void *); + + /* Called in rpc_task_prepare for unlock. A return value of true + * indicates the callback has put the task to sleep on a waitqueue + * and NLM should not call rpc_call_start(). */ + bool (*nlmclnt_unlock_prepare)(struct rpc_task*, void *); + + /* Called when the nlm_rqst is freed, callbacks should clean up here */ + void (*nlmclnt_release_call)(void *); +}; + +extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data); extern int lockd_up(struct net *net); extern void lockd_down(struct net *net); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b37dee3acaba..41f7b6a04d69 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -69,6 +69,7 @@ struct nlm_host { char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; /* @@ -142,6 +143,7 @@ struct nlm_rqst { struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51e27f9746ee..677c6b91dfcd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1551,6 +1551,7 @@ struct nfs_rpc_ops { const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; const struct file_operations *file_ops; + const struct nlmclnt_operations *nlmclnt_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -- cgit v1.2.3 From b62ea4112ce3746664dcc2f232d03461f0e6f3c7 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 21 Apr 2017 16:47:11 +0200 Subject: video: fbdev: imxfb: support AUS mode Some displays require setting AUS mode in the LDCD AUS Mode Control Register to work with the imxfb driver. Like the value of the Panel Configuration Register, the AUS mode setting depends on the display mode. Allow setting AUS mode from the device tree by adding a boolean property. Make this property optional to keep the DT ABI stable. AUS mode can be set only on imx21 and compatible chipsets. Signed-off-by: Martin Kaiser Cc: Sascha Hauer Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/platform_data/video-imxfb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h index a5c0a71ec914..cf9348b376ac 100644 --- a/include/linux/platform_data/video-imxfb.h +++ b/include/linux/platform_data/video-imxfb.h @@ -50,6 +50,7 @@ struct imx_fb_videomode { struct fb_videomode mode; u32 pcr; + bool aus_mode; unsigned char bpp; }; -- cgit v1.2.3 From 4f757f3cbf54edef7b75c68d6d6d2f1a0ca08d2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Apr 2017 17:31:22 -0400 Subject: make sure that mntns_install() doesn't end up with referral for root new flag: LOOKUP_DOWN. If the starting point is overmounted, cross into whatever's mounted on top, triggering referrals et.al. Use that instead of follow_down_one() loop in mntns_install(), handle errors properly. Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index f29abda31e6d..8b4794e83196 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -44,6 +44,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +#define LOOKUP_DOWN 0x8000 extern int path_pts(struct path *path); -- cgit v1.2.3 From cf0c3e68aa81f992b0301f62e341b710d385bf68 Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes about these invalid asm statements. Hack it even more by fooling this is actual valid asm code. [masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke --- include/linux/kbuild.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif -- cgit v1.2.3 From f107d7a43923a83d837b3ea3c7b7de58cd014bbd Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 16 Mar 2017 09:02:42 +0100 Subject: mtd: nand: Remove unused chip->write_page() hook The last/only user of the chip->write_page() hook (the Atmel NAND controller driver) has been reworked and is no longer specifying a custom ->write_page() implementation. Drop this hook before someone else start abusing it. Signed-off-by: Boris Brezillon Reviewed-by: Masahiro Yamada --- include/linux/mtd/nand.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index c7de017c7f4c..40657939797c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -828,7 +828,6 @@ struct nand_manufacturer_ops { * @errstat: [OPTIONAL] hardware specific function to perform * additional error status checks (determine if errors are * correctable). - * @write_page: [REPLACEABLE] High-level page write function * @manufacturer: [INTERN] Contains manufacturer information */ @@ -854,9 +853,6 @@ struct nand_chip { int (*scan_bbt)(struct mtd_info *mtd); int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page); - int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip, - uint32_t offset, int data_len, const uint8_t *buf, - int oob_required, int page, int cached, int raw); int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip, int feature_addr, uint8_t *subfeature_para); int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip, -- cgit v1.2.3 From 07604686e808cd93d352172806a7828860f048f5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 30 Mar 2017 15:45:47 +0900 Subject: mtd: nand: relax ecc.read_page() return value for uncorrectable ECC The comment for ecc.read_page() requires that it should return "0 if bitflips uncorrectable". Actually, drivers could return positive values when uncorrectable bitflips occur. For example, nand_read_page_swecc() is the case. If ecc.correct() returns -EBADMSG for the first ECC sector, and a positive value for the second one, nand_read_page_swecc() returns a positive max_bitflips and increments ecc_stats.failed for the same page. The requirement can be relaxed by tweaking nand_do_read_ops(). Move the max_bitflips calculation below the retry. Signed-off-by: Masahiro Yamada Suggested-by: Boris Brezillon Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 40657939797c..9e0c93c44bef 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -516,7 +516,7 @@ static inline void nand_hw_control_init(struct nand_hw_control *nfc) * out-of-band data). * @read_page: function to read a page according to the ECC generator * requirements; returns maximum number of bitflips corrected in - * any single ECC step, 0 if bitflips uncorrectable, -EIO hw error + * any single ECC step, -EIO hw error * @read_subpage: function to read parts of the page covered by ECC; * returns same as read_page() * @write_subpage: function to write parts of the page covered by ECC. -- cgit v1.2.3 From 477544c62a84d3bacd9f90ba75ffc16c04d78071 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 30 Mar 2017 17:15:05 +0900 Subject: mtd: nand: allow drivers to request minimum alignment for passed buffer In some cases, nand_do_{read,write}_ops is passed with unaligned ops->datbuf. Drivers using DMA will be unhappy about unaligned buffer. The new struct member, buf_align, represents the minimum alignment the driver require for the buffer. If the buffer passed from the upper MTD layer does not have enough alignment, nand_do_*_ops will use bufpoi. Signed-off-by: Masahiro Yamada Signed-off-by: Boris Brezillon --- include/linux/mtd/nand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9e0c93c44bef..8f67b1581683 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -755,6 +755,7 @@ struct nand_manufacturer_ops { * setting the read-retry mode. Mostly needed for MLC NAND. * @ecc: [BOARDSPECIFIC] ECC control structure * @buffers: buffer structure for read/write + * @buf_align: minimum buffer alignment required by a platform * @hwcontrol: platform-specific hardware control structure * @erase: [REPLACEABLE] erase function * @scan_bbt: [REPLACEABLE] function to scan bad block table @@ -905,6 +906,7 @@ struct nand_chip { struct nand_ecc_ctrl ecc; struct nand_buffers *buffers; + unsigned long buf_align; struct nand_hw_control hwcontrol; uint8_t *bbt; -- cgit v1.2.3 From 51f567777799c9d85a778302b9eb61cf15214a98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Apr 2017 22:36:31 -0400 Subject: nfsd: check for oversized NFSv2/v3 arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A client can append random data to the end of an NFSv2 or NFSv3 RPC call without our complaining; we'll just stop parsing at the end of the expected data and ignore the rest. Encoded arguments and replies are stored together in an array of pages, and if a call is too large it could leave inadequate space for the reply. This is normally OK because NFS RPC's typically have either short arguments and long replies (like READ) or long arguments and short replies (like WRITE). But a client that sends an incorrectly long reply can violate those assumptions. This was observed to cause crashes. So, insist that the argument not be any longer than we expect. Also, several operations increment rq_next_page in the decode routine before checking the argument size, which can leave rq_next_page pointing well past the end of the page array, causing trouble later in svc_free_pages. As followup we may also want to rewrite the encoding routines to check more carefully that they aren't running off the end of the page array. Reported-by: Tuomas Haanpää Reported-by: Ari Kauppi Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e770abeed32d..6ef19cf658b4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) { char *cp = (char *)p; struct kvec *vec = &rqstp->rq_arg.head[0]; - return cp >= (char*)vec->iov_base - && cp <= (char*)vec->iov_base + vec->iov_len; + return cp == (char *)vec->iov_base + vec->iov_len; } static inline int -- cgit v1.2.3 From 17f5f7f506aaca985b95df7ef7fc2ff49c36a8e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:36 -0400 Subject: svcrdma: Move send_wr to svc_rdma_op_ctxt Clean up: Move the ib_send_wr off the stack, and move common code to post a Send Work Request into a helper. This is a refactoring change only. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b105f73e3ca2..287db5c179d8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -85,6 +85,7 @@ struct svc_rdma_op_ctxt { enum dma_data_direction direction; int count; unsigned int mapped_sges; + struct ib_send_wr send_wr; struct ib_sge sge[RPCSVC_MAXPAGES]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -227,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); -- cgit v1.2.3 From 6e6092ca305ad785c605d7e313727aad96c228a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:44 -0400 Subject: svcrdma: Add svc_rdma_map_reply_hdr() Introduce a helper to DMA-map a reply's transport header before sending it. This will in part replace the map vector cache. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 287db5c179d8..002a46d1faa1 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -228,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, unsigned int len); extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); -- cgit v1.2.3 From b623589dbacbc786c2fffc85113a1dc1a331e2ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:52 -0400 Subject: svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT The Send Queue depth is temporarily reduced to 1 SQE per credit. The new rdma_rw API does an internal computation, during QP creation, to increase the depth of the Send Queue to handle RDMA Read and Write operations. This change has to come before the NFSD code paths are updated to use the rdma_rw API. Without this patch, rdma_rw_init_qp() increases the size of the SQ too much, resulting in memory allocation failures during QP creation. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 002a46d1faa1..11d5aa123f17 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,7 +182,6 @@ struct svcxprt_rdma { /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ #define RPCRDMA_ORD (64/4) -#define RPCRDMA_SQ_DEPTH_MULT 8 #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 -- cgit v1.2.3 From f13193f50b64e2e0c87706b838d6b9895626a892 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:16 -0400 Subject: svcrdma: Introduce local rdma_rw API helpers The plan is to replace the local bespoke code that constructs and posts RDMA Read and Write Work Requests with calls to the rdma_rw API. This shares code with other RDMA-enabled ULPs that manages the gory details of buffer registration and posting Work Requests. Some design notes: o The structure of RPC-over-RDMA transport headers is flexible, allowing multiple segments per Reply with arbitrary alignment, each with a unique R_key. Write and Send WRs continue to be built and posted in separate code paths. However, one whole chunk (with one or more RDMA segments apiece) gets exactly one ib_post_send and one work completion. o svc_xprt reference counting is modified, since a chain of rdma_rw_ctx structs generates one completion, no matter how many Write WRs are posted. o The current code builds the transport header as it is construct- ing Write WRs. I've replaced that with marshaling of transport header data items in a separate step. This is because the exact structure of client-provided segments may not align with the components of the server's reply xdr_buf, or the pages in the page list. Thus parts of each client-provided segment may be written at different points in the send path. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 11d5aa123f17..ca08671fb7e2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -145,12 +145,15 @@ struct svcxprt_rdma { u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ + u8 sc_port_num; struct ib_pd *sc_pd; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; + spinlock_t sc_rw_ctxt_lock; + struct list_head sc_rw_ctxts; spinlock_t sc_map_lock; struct list_head sc_maps; @@ -224,6 +227,14 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); +/* svc_rdma_rw.c */ +extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + __be32 *wr_ch, struct xdr_buf *xdr); +extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + __be32 *rp_ch, bool writelist, + struct xdr_buf *xdr); + /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); -- cgit v1.2.3 From 9a6a180b7867ceceeeab88a6f011bac23174b939 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:25 -0400 Subject: svcrdma: Use rdma_rw API in RPC reply path The current svcrdma sendto code path posts one RDMA Write WR at a time. Each of these Writes typically carries a small number of pages (for instance, up to 30 pages for mlx4 devices). That means a 1MB NFS READ reply requires 9 ib_post_send() calls for the Write WRs, and one for the Send WR carrying the actual RPC Reply message. Instead, use the new rdma_rw API. The details of Write WR chain construction and memory registration are taken care of in the RDMA core. svcrdma can focus on the details of the RPC-over-RDMA protocol. This gives three main benefits: 1. All Write WRs for one RDMA segment are posted in a single chain. As few as one ib_post_send() for each Write chunk. 2. The Write path can now use FRWR to register the Write buffers. If the device's maximum page list depth is large, this means a single Write WR is needed for each RPC's Write chunk data. 3. The new code introduces support for RPCs that carry both a Write list and a Reply chunk. This combination can be used for an NFSv4 READ where the data payload is large, and thus is removed from the Payload Stream, but the Payload Stream is still larger than the inline threshold. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ca08671fb7e2..599ee03ee3fb 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -212,7 +212,6 @@ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, __be32 *); -extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); -- cgit v1.2.3 From 6b19cc5ca2f78ebc88f5d39ba6a94197bb392fcc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:33 -0400 Subject: svcrdma: Clean up RDMA_ERROR path Now that svc_rdma_sendto has been renovated, svc_rdma_send_error can be refactored to reduce code duplication and remove C structure- based XDR encoding. It is also relocated to the source file that contains its only caller. This is a refactoring change only. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 3 +++ include/linux/sunrpc/svc_rdma.h | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 245fc59b7324..b7e85b341a54 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -143,6 +143,9 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +#define err_vers cpu_to_be32(ERR_VERS) +#define err_chunk cpu_to_be32(ERR_CHUNK) + /* * Private extension to RPC-over-RDMA Version One. * Message passed during RDMA-CM connection set-up. diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 599ee03ee3fb..a770d200f607 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -209,9 +209,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, - struct rpcrdma_msg *, - enum rpcrdma_errcode, __be32 *); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); @@ -244,8 +241,6 @@ extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); -extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - int); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -- cgit v1.2.3 From f5821c76b2c9c2fb98b276c0bf6a101bfe9050a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:49 -0400 Subject: svcrdma: Clean up RPC-over-RDMA backchannel reply processing Replace C structure-based XDR decoding with pointer arithmetic. Pointer arithmetic is considered more portable. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a770d200f607..44d642bbfce6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -204,7 +204,7 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - struct rpcrdma_msg *rmsgp, + __be32 *rdma_resp, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ -- cgit v1.2.3 From ded8d19641a605232ab48f5d27f542648beba3cc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:57 -0400 Subject: svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt The sge array in struct svc_rdma_op_ctxt is no longer used for sending RDMA Write WRs. It need only accommodate the construction of Send and Receive WRs. The maximum inline size is the largest payload it needs to handle now. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 44d642bbfce6..e84b77556784 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,6 +48,12 @@ #include #define SVCRDMA_DEBUG +/* Default and maximum inline threshold sizes */ +enum { + RPCRDMA_DEF_INLINE_THRESH = 4096, + RPCRDMA_MAX_INLINE_THRESH = 65536 +}; + /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; @@ -86,7 +92,7 @@ struct svc_rdma_op_ctxt { int count; unsigned int mapped_sges; struct ib_send_wr send_wr; - struct ib_sge sge[RPCSVC_MAXPAGES]; + struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -186,7 +192,6 @@ struct svcxprt_rdma { * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ #define RPCRDMA_ORD (64/4) #define RPCRDMA_MAX_REQUESTS 32 -#define RPCRDMA_MAX_REQ_SIZE 4096 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our * current NFSv4.1 implementation supports one backchannel slot. -- cgit v1.2.3 From 68cc4636bbbca89b9fedcf46d8b6bee444fc5e4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:05 -0400 Subject: svcrdma: Remove unused RDMA Write completion handler Clean up. All RDMA Write completions are now handled by svc_rdma_wc_write_ctx. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e84b77556784..f58c5349beb7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -249,7 +249,6 @@ extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); -- cgit v1.2.3 From 2cf32924c68a22783e6f630e1b5345a80aa1a376 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:13 -0400 Subject: svcrdma: Remove the req_map cache req_maps are no longer used by the send path and can thus be removed. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f58c5349beb7..479bb7f65233 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -96,23 +96,6 @@ struct svc_rdma_op_ctxt { struct page *pages[RPCSVC_MAXPAGES]; }; -/* - * NFS_ requests are mapped on the client side by the chunk lists in - * the RPCRDMA header. During the fetching of the RPC from the client - * and the writing of the reply to the client, the memory in the - * client and the memory in the server must be mapped as contiguous - * vaddr/len for access by the hardware. These data strucures keep - * these mappings. - * - * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the - * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the - * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' - * mapping of the reply. - */ -struct svc_rdma_chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; struct svc_rdma_fastreg_mr { struct ib_mr *mr; struct scatterlist *sg; @@ -121,15 +104,7 @@ struct svc_rdma_fastreg_mr { enum dma_data_direction direction; struct list_head frmr_list; }; -struct svc_rdma_req_map { - struct list_head free; - unsigned long count; - union { - struct kvec sge[RPCSVC_MAXPAGES]; - struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; - unsigned long lkey[RPCSVC_MAXPAGES]; - }; -}; + #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ @@ -160,8 +135,6 @@ struct svcxprt_rdma { int sc_ctxt_used; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; - spinlock_t sc_map_lock; - struct list_head sc_maps; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -237,8 +210,6 @@ extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, struct xdr_buf *xdr); /* svc_rdma_sendto.c */ -extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, - struct svc_rdma_req_map *, bool); extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, __be32 *rdma_resp, unsigned int len); @@ -259,9 +230,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); -extern void svc_rdma_put_req_map(struct svcxprt_rdma *, - struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); -- cgit v1.2.3 From dadf3e435debb85dfcf28c157012047153a21a97 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:21 -0400 Subject: svcrdma: Clean out old XDR encoders Clean up: These have been replaced and are no longer used. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 479bb7f65233..f3787d800ba4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -187,10 +187,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); -extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, - __be32, __be64, u32); -extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); -- cgit v1.2.3 From c373fff7bd252ec36e8a895c58a584088f1d38bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 12:26:22 -0400 Subject: NFSv4: Don't special case "launder" If the client receives a fatal server error from nfs_pageio_add_request(), then we should always truncate the page on which the error occurred. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9aa044e76820..bb0eb2c9acca 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -500,24 +500,12 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); +extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); -static inline int -nfs_wb_launder_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, true); -} - -static inline int -nfs_wb_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, false); -} - static inline int nfs_have_writebacks(struct inode *inode) { -- cgit v1.2.3 From c7e88067c1ae89e7bcbed070fb2c4e30bc39b51f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 16:01:46 -0700 Subject: srcu: Exact tracking of srcu_data structures containing callbacks The current Tree SRCU implementation schedules a workqueue for every srcu_data covered by a given leaf srcu_node structure having callbacks, even if only one of those srcu_data structures actually contains callbacks. This is clearly inefficient for workloads that don't feature callbacks everywhere all the time. This commit therefore adds an array of masks that are used by the leaf srcu_node structures to track exactly which srcu_data structures contain callbacks. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0400e211aa44..94515ff226fb 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,6 +47,8 @@ struct srcu_data { struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ struct srcu_node *mynode; /* Leaf srcu_node. */ + unsigned long grpmask; /* Mask for leaf srcu_node */ + /* ->srcu_data_have_cbs[]. */ int cpu; struct srcu_struct *sp; }; @@ -59,6 +61,8 @@ struct srcu_node { unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ + /* have CBs for given GP? */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. */ -- cgit v1.2.3 From 7f6733c3c648ddd6cf459c1b80ad388a95452955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 17:17:35 -0700 Subject: srcu: Make rcutorture writer stalls print SRCU GP state In the past, SRCU was simple enough that there was little point in making the rcutorture writer stall messages print the SRCU grace-period number state. With the advent of Tree SRCU, this has changed. This commit therefore makes Classic, Tiny, and Tree SRCU report this state to rcutorture as needed. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcuclassic.h | 14 ++++++++++++++ include/linux/srcutiny.h | 12 ++++++++++++ include/linux/srcutree.h | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h index 41cf99930f34..5753f7322262 100644 --- a/include/linux/srcuclassic.h +++ b/include/linux/srcuclassic.h @@ -98,4 +98,18 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + (*gpnum)++; +} + #endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4f284e4f4d8c..42311ee0334f 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -78,4 +78,16 @@ static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) return 0; } +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 94515ff226fb..3865717df124 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -140,4 +140,8 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + #endif -- cgit v1.2.3 From bfd20f1cc85010d2f2d77e544da05cd8c149ba9b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 26 Apr 2017 09:18:35 -0700 Subject: x86, iommu/vt-d: Add an option to disable Intel IOMMU force on IOMMU harms performance signficantly when we run very fast networking workloads. It's 40GB networking doing XDP test. Software overhead is almost unaware, but it's the IOTLB miss (based on our analysis) which kills the performance. We observed the same performance issue even with software passthrough (identity mapping), only the hardware passthrough survives. The pps with iommu (with software passthrough) is only about ~30% of that without it. This is a limitation in hardware based on our observation, so we'd like to disable the IOMMU force on, but we do want to use TBOOT and we can sacrifice the DMA security bought by IOMMU. I must admit I know nothing about TBOOT, but TBOOT guys (cc-ed) think not eabling IOMMU is totally ok. So introduce a new boot option to disable the force on. It's kind of silly we need to run into intel_iommu_init even without force on, but we need to disable TBOOT PMR registers. For system without the boot option, nothing is changed. Signed-off-by: Shaohua Li Signed-off-by: Joerg Roedel --- include/linux/dma_remapping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 187c10299722..90884072fa73 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -39,6 +39,7 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; +extern int intel_iommu_tboot_noforce; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { -- cgit v1.2.3 From 1e9a038b7fe9a8c10ef1238f4e695d5fbe0dd594 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Apr 2017 16:02:09 -0700 Subject: srcu: Expedited grace periods with reduced memory contention Commit f60d231a87c5 ("srcu: Crude control of expedited grace periods") introduced a per-srcu_struct atomic counter to track outstanding requests for grace periods. This works, but represents a memory-contention bottleneck. This commit therefore uses the srcu_node combining tree to remove this bottleneck. This commit adds new ->srcu_gp_seq_needed_exp fields to the srcu_data, srcu_node, and srcu_struct structures, which track the farthest-in-the-future grace period that must be expedited, which in turn requires that all nearer-term grace periods also be expedited. Requests for expediting start with the srcu_data structure, run up through the srcu_node tree, and end at the srcu_struct structure. Note that it may be necessary to expedite a grace period that just now started, and this is handled by a new srcu_funnel_exp_start() function, which is invoked when the grace period itself is already in its way, but when that grace period was not marked as expedited. A new srcu_get_delay() function returns zero if there is at least one expedited SRCU grace period in flight, or SRCU_INTERVAL otherwise. This function is used to calculate delays: Normal grace periods are allowed to extend in order to cover more requests with a given grace-period computation, which decreases per-request overhead. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 3865717df124..86df48d3e97b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,6 +43,7 @@ struct srcu_data { spinlock_t lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ bool srcu_cblist_invoking; /* Invoking these CBs? */ struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ @@ -63,6 +64,7 @@ struct srcu_node { /* is > ->srcu_gq_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ /* have CBs for given GP? */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. */ @@ -81,7 +83,7 @@ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ - atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ -- cgit v1.2.3 From 22607d66bbc3e81140d3bcf08894f4378eb36428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 14:03:11 -0700 Subject: srcu: Specify auto-expedite holdoff time On small systems, in the absence of readers, expedited SRCU grace periods can complete in less than a microsecond. This means that an eight-CPU system can have all CPUs doing synchronize_srcu() in a tight loop and almost always expedite. This might actually be desirable in some situations, but in general it is a good way to needlessly burn CPU cycles. And in those situations where it is desirable, your friend is the function synchronize_srcu_expedited(). For other situations, this commit adds a kernel parameter that specifies a holdoff between completing the last SRCU grace period and auto-expediting the next. If the next grace period starts before the holdoff expires, auto-expediting is disabled. The holdoff is 50 microseconds by default, and can be tuned to the desired number of nanoseconds. A value of zero disables auto-expediting. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 86df48d3e97b..32e86d85fd11 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -84,6 +84,7 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ -- cgit v1.2.3 From e8245c1b1a3bb8474f91c69ccd13637d3589bb2c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:34:06 +0200 Subject: iommu: Include device.h in iommu.h We make use of 'struct device' in iommu.h, so include device.h to make it available explicitly. Re-order the other headers while at it. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6a6de187ddc0..3b4fe4b79d20 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -19,11 +19,13 @@ #ifndef __LINUX_IOMMU_H #define __LINUX_IOMMU_H +#include +#include +#include #include #include #include -#include -#include + #include #define IOMMU_READ (1 << 0) -- cgit v1.2.3 From 207c6e36f122ebb1164d611c9f34f128313f47d5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:39:28 +0200 Subject: iommu: Move report_iommu_fault() to iommu.c The function is in no fast-path, there is no need for it to be static inline in a header file. This also removes the need to include iommu trace-points in iommu.h. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3b4fe4b79d20..abaa0ca848bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -330,46 +330,9 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); -/** - * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework - * @domain: the iommu domain where the fault has happened - * @dev: the device where the fault has happened - * @iova: the faulting address - * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) - * - * This function should be called by the low-level IOMMU implementations - * whenever IOMMU faults happen, to allow high-level users, that are - * interested in such events, to know about them. - * - * This event may be useful for several possible use cases: - * - mere logging of the event - * - dynamic TLB/PTE loading - * - if restarting of the faulting device is required - * - * Returns 0 on success and an appropriate error code otherwise (if dynamic - * PTE/TLB loading will one day be supported, implementations will be able - * to tell whether it succeeded or not according to this return value). - * - * Specifically, -ENOSYS is returned if a fault handler isn't installed - * (though fault handlers can also return -ENOSYS, in case they want to - * elicit the default behavior of the IOMMU drivers). - */ -static inline int report_iommu_fault(struct iommu_domain *domain, - struct device *dev, unsigned long iova, int flags) -{ - int ret = -ENOSYS; - /* - * if upper layers showed interest and installed a fault handler, - * invoke it. - */ - if (domain->handler) - ret = domain->handler(domain, dev, iova, flags, - domain->handler_token); - - trace_io_page_fault(dev, iova, flags); - return ret; -} +extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags); static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, -- cgit v1.2.3 From 5af50993850a48ba749b122173d789ea90976c72 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 5 Apr 2017 17:54:56 +1000 Subject: KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller This patch makes KVM capable of using the XIVE interrupt controller to provide the standard PAPR "XICS" style hypercalls. It is necessary for proper operations when the host uses XIVE natively. This has been lightly tested on an actual system, including PCI pass-through with a TG3 device. Signed-off-by: Benjamin Herrenschmidt [mpe: Cleanup pr_xxx(), unsplit pr_xxx() strings, etc., fix build failures by adding KVM_XIVE which depends on KVM_XICS and XIVE, and adding empty stubs for the kvm_xive_xxx() routines, fixup subject, integrate fixes from Paul for building PR=y HV=n] Signed-off-by: Michael Ellerman --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d1a6e554ee68 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; -extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; -- cgit v1.2.3 From ed6473ddc704a2005b9900ca08e236ebb2d8540a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 11:55:27 -0400 Subject: NFSv4: Fix callback server shutdown We want to use kthread_stop() in order to ensure the threads are shut down before we tear down the nfs_callback_info in nfs_callback_down. Tested-and-reviewed-by: Kinglong Mee Reported-by: Kinglong Mee Fixes: bb6aeba736ba9 ("NFSv4.x: Switch to using svc_set_num_threads()...") Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6ef19cf658b4..94631026f79c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -473,6 +473,7 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); -- cgit v1.2.3 From 461a6946b1f93f6720577fb06aa78e8cbd9291c9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:46:20 +0200 Subject: iommu: Remove pci.h include from trace/events/iommu.h The include file does not need any PCI specifics, so remove that include. Also fix the places that relied on it. Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..abd946569515 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ #include #ifdef CONFIG_IOMMU_DMA +#include #include #include -- cgit v1.2.3 From 208480bb273e15f42711bd47f70dc0fbfa2570b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:49:57 +0200 Subject: iommu: Remove trace-events include from iommu.h It is not needed there anymore. All places needing it are fixed. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index abaa0ca848bc..dda8717545e9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -26,8 +26,6 @@ #include #include -#include - #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) #define IOMMU_CACHE (1 << 2) /* DMA cache coherency */ -- cgit v1.2.3 From 917362135b8a5c0680acf08807e9fc6179eb6c79 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:49 +0200 Subject: power: supply: max17042_battery: Add default platform_data fallback data Some x86 machines use a max17047 fuel-gauge and x86 might be missing platform_data if not provided by SFI. This commit adds default platform_data as fallback option so that the driver can work on boards where no platform_data is provided. Since not all boards have a thermistor hooked up, set temp_min to 0 and change the health checks from temp <= temp_min to temp < temp_min to not trigger on such boards (where temp reads 0). Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 522757ac9cd4..3489fb0f9099 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -24,8 +24,12 @@ #define __MAX17042_BATTERY_H_ #define MAX17042_STATUS_BattAbsent (1 << 3) -#define MAX17042_BATTERY_FULL (100) +#define MAX17042_BATTERY_FULL (95) /* Recommend. FullSOCThr value */ #define MAX17042_DEFAULT_SNS_RESISTOR (10000) +#define MAX17042_DEFAULT_VMIN (3000) +#define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ +#define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ +#define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 -- cgit v1.2.3 From a9df22c00d7c2c9c2944c62f1b819de6c214660f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:51 +0200 Subject: power: supply: max17042_battery: Add support for the STATUS property Userspace prefers the driver having a status property over having to guess itself. Specifically this will properly make the GNOME3 UI (and likely others) properly show discharging / charging / full status, instead of always showing discharging as status. Note that in the case there is no charger driver supplying the max17042, then a status of unknown will get returned. At least upower treats this the same as not having a status attribute, so in this case nothing changes from a userspace pov. Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 3489fb0f9099..a7ed29baf44a 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -31,6 +31,9 @@ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ +/* Consider RepCap which is less then 10 units below FullCAP full */ +#define MAX17042_FULL_THRESHOLD 10 + #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 enum max17042_register { -- cgit v1.2.3 From 45753c5f315749711b935a2506ee5c10eef5c23d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 May 2017 10:31:18 +0200 Subject: srcu: Debloat the header Linus noticed that the has huge inline functions which should not be inline at all. As a first step in cleaning this up, move them all to kernel/rcu/ and only keep an absolute minimum of data type defines in the header: before: -rw-r--r-- 1 mingo mingo 22284 May 2 10:25 include/linux/rcu_segcblist.h after: -rw-r--r-- 1 mingo mingo 3180 May 2 10:22 include/linux/rcu_segcblist.h More can be done, such as uninlining the large functions, which inlining is unjustified even if it's an RCU internal matter. Reported-by: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 628 +----------------------------------------- 1 file changed, 3 insertions(+), 625 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ced8f313fd05..ba4d2621d9ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -20,8 +20,8 @@ * Authors: Paul E. McKenney */ -#ifndef __KERNEL_RCU_SEGCBLIST_H -#define __KERNEL_RCU_SEGCBLIST_H +#ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H +#define __INCLUDE_LINUX_RCU_SEGCBLIST_H /* Simple unsegmented callback lists. */ struct rcu_cblist { @@ -33,102 +33,6 @@ struct rcu_cblist { #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } -/* Initialize simple callback list. */ -static inline void rcu_cblist_init(struct rcu_cblist *rclp) -{ - rclp->head = NULL; - rclp->tail = &rclp->head; - rclp->len = 0; - rclp->len_lazy = 0; -} - -/* Is simple callback list empty? */ -static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) -{ - return !rclp->head; -} - -/* Return number of callbacks in simple callback list. */ -static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) -{ - return rclp->len; -} - -/* Return number of lazy callbacks in simple callback list. */ -static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) -{ - return rclp->len_lazy; -} - -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* - * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. - */ -static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) -{ - struct rcu_head *rhp; - - rhp = rclp->head; - if (!rhp) - return NULL; - rclp->len--; - rclp->head = rhp->next; - if (!rclp->head) - rclp->tail = &rclp->head; - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(rcu_cblist_empty(rclp)); - return rclp->tail; -} - /* Complicated segmented callback lists. ;-) */ /* @@ -183,530 +87,4 @@ struct rcu_segcblist { .tails[RCU_NEXT_TAIL] = &n.head, \ } -/* - * Initialize an rcu_segcblist structure. - */ -static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) -{ - int i; - - BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); - BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); - rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; -} - -/* - * Is the specified rcu_segcblist structure empty? - * - * But careful! The fact that the ->head field is NULL does not - * necessarily imply that there are no callbacks associated with - * this structure. When callbacks are being invoked, they are - * removed as a group. If callback invocation must be preempted, - * the remaining callbacks will be added back to the list. Either - * way, the counts are updated later. - * - * So it is often the case that rcu_segcblist_n_cbs() should be used - * instead. - */ -static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) -{ - return !rsclp->head; -} - -/* Return number of callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) -{ - return READ_ONCE(rsclp->len); -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len - rsclp->len_lazy; -} - -/* - * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? - */ -static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) -{ - return !!rsclp->tails[RCU_NEXT_TAIL]; -} - -/* - * Disable the specified rcu_segcblist structure, so that callbacks can - * no longer be posted to it. This structure must be empty. - */ -static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; -} - -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) - */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) -{ - return !*rsclp->tails[seg]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are ready to be invoked? - */ -static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are still pending, that is, not yet ready to be invoked? - */ -static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); -} - -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -static inline struct rcu_head * -rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* - * Return a pointer to the first callback in the specified rcu_segcblist - * structure. This is useful for diagnostics. - */ -static inline struct rcu_head * -rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return rsclp->head; - return NULL; -} - -/* - * Return a pointer to the first pending callback in the specified - * rcu_segcblist structure. This is useful just after posting a given - * callback -- if that callback is the first pending callback, then - * you cannot rely on someone else having already started up the required - * grace period. - */ -static inline struct rcu_head * -rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return *rsclp->tails[RCU_DONE_TAIL]; - return NULL; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* - * Enqueue the specified callback onto the specified rcu_segcblist - * structure, updating accounting as needed. Note that the ->len - * field may be accessed locklessly, hence the WRITE_ONCE(). - * The ->len field is used by rcu_barrier() and friends to determine - * if it must post a callback on this structure, and it is OK - * for rcu_barrier() to sometimes post callbacks needlessly, but - * absolutely not OK for it to ever miss posting a callback. - */ -static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ - rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; -} - -/* - * Entrain the specified callback onto the specified rcu_segcblist at - * the end of the last non-empty segment. If the entire rcu_segcblist - * is empty, make no change, but return false. - * - * This is intended for use by rcu_barrier()-like primitives, -not- - * for normal grace-period use. IMPORTANT: The callback you enqueue - * will wait for all prior callbacks, NOT necessarily for a grace - * period. You have been warned. - */ -static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - int i; - - if (rcu_segcblist_n_cbs(rsclp) == 0) - return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is entrained. */ - rhp->next = NULL; - for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) - break; - *rsclp->tails[i] = rhp; - for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; - return true; -} - -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* - * Extract only those callbacks ready to be invoked from the specified - * rcu_segcblist structure and place them in the specified rcu_cblist - * structure. - */ -static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_ready_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - rclp->tail = rsclp->tails[RCU_DONE_TAIL]; - for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) - if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; -} - -/* - * Extract only those callbacks still pending (not yet ready to be - * invoked) from the specified rcu_segcblist structure and place them in - * the specified rcu_cblist structure. Note that this loses information - * about any callbacks that might have been partway done waiting for - * their grace period. Too bad! They will have to start over. - */ -static inline void -rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_pend_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; - rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Move the entire contents of the specified rcu_segcblist structure, - * counts, callbacks, and all, to the specified rcu_cblist structure. - * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? - * @@@ Memory barrier needed? (Not if only used at boot time...) - */ -static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rcu_segcblist_extract_done_cbs(rsclp, rclp); - rcu_segcblist_extract_pend_cbs(rsclp, rclp); - rcu_segcblist_extract_count(rsclp, rclp); -} - -/* - * Insert counts from the specified rcu_cblist structure in the - * specified rcu_segcblist structure. - */ -static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; -} - -/* - * Move callbacks from the specified rcu_cblist to the beginning of the - * done-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rclp->head) - return; /* No callbacks to move. */ - *rclp->tail = rsclp->head; - rsclp->head = rclp->head; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) - if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; - else - break; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Move callbacks from the specified rcu_cblist to the end of the - * new-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - if (!rclp->head) - return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Advance the callbacks in the specified rcu_segcblist structure based - * on the current value passed in for the grace-period counter. - */ -static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i, j; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return; - - /* - * Find all callbacks whose ->gp_seq numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL segment. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; - } - - /* If no callbacks moved, nothing more need be done. */ - if (i == RCU_WAIT_TAIL) - return; - - /* Clean up tail pointers that might have been misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; - - /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. - */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) - break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; - rsclp->gp_seq[j] = rsclp->gp_seq[i]; - } -} - -/* - * "Accelerate" callbacks based on more-accurate grace-period information. - * The reason for this is that RCU does not synchronize the beginnings and - * ends of grace periods, and that callbacks are posted locally. This in - * turn means that the callbacks must be labelled conservatively early - * on, as getting exact information would degrade both performance and - * scalability. When more accurate grace-period information becomes - * available, previously posted callbacks can be "accelerated", marking - * them to complete at the end of the earlier grace period. - * - * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number seq at which new callbacks would become - * ready to invoke. Returns true if there are callbacks that won't be - * ready to invoke until seq, false otherwise. - */ -static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return false; - - /* - * Find the segment preceding the oldest segment of callbacks - * whose ->gp_seq[] completion is at or after that passed in via - * "seq", skipping any empty segments. This oldest segment, along - * with any later segments, can be merged in with any newly arrived - * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" - * as their ->gp_seq[] grace-period completion sequence number. - */ - for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && - ULONG_CMP_LT(rsclp->gp_seq[i], seq)) - break; - - /* - * If all the segments contain callbacks that correspond to - * earlier grace-period sequence numbers than "seq", leave. - * Assuming that the rcu_segcblist structure has enough - * segments in its arrays, this can only happen if some of - * the non-done segments contain callbacks that really are - * ready to invoke. This situation will get straightened - * out by the next call to rcu_segcblist_advance(). - * - * Also advance to the oldest segment of callbacks whose - * ->gp_seq[] completion is at or after that passed in via "seq", - * skipping any empty segments. - */ - if (++i >= RCU_NEXT_TAIL) - return false; - - /* - * Merge all later callbacks, including newly arrived callbacks, - * into the segment located by the for-loop above. Assign "seq" - * as the ->gp_seq[] value in order to correctly handle the case - * where there were no pending callbacks in the rcu_segcblist - * structure other than in the RCU_NEXT_TAIL segment. - */ - for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; - rsclp->gp_seq[i] = seq; - } - return true; -} - -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - -#endif /* __KERNEL_RCU_SEGCBLIST_H */ +#endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */ -- cgit v1.2.3 From 9b2bbdb227588455afcc3b03475fa9b0a35d83af Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:19:39 +0200 Subject: virtio: wrap find_vqs We are going to add more parameters to find_vqs, let's wrap the call so we don't need to tweak all drivers every time. Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8355bab175e1..47f3d805c290 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -179,6 +179,15 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, return vq; } +static inline +int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); +} + /** * virtio_device_ready - enable vq use in probe function * @vdev: the device -- cgit v1.2.3 From f94682dde5ed23eed13533a37dfce942e60ade4e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:32:29 +0200 Subject: virtio: add context flag to find vqs Allows maintaining extra context per vq. For ease of use, passing in NULL is legal and disables the feature for all vqs. Includes fixes by Christian for s390, acked by Cornelia. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 18 +++++++++++++++--- include/linux/virtio_ring.h | 3 +++ 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 47f3d805c290..0133d8a12ccd 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -72,7 +72,8 @@ struct virtio_config_ops { void (*reset)(struct virtio_device *vdev); int (*find_vqs)(struct virtio_device *, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); @@ -173,7 +174,8 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *callbacks[] = { c }; const char *names[] = { n }; struct virtqueue *vq; - int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL); + int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL, + NULL); if (err < 0) return ERR_PTR(err); return vq; @@ -185,7 +187,17 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, const char * const names[], struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc); +} + +static inline +int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, + desc); } /** diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e8d36938f09a..270cfa81830e 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -71,6 +71,7 @@ struct virtqueue *vring_create_virtqueue(unsigned int index, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); @@ -80,6 +81,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool ctx, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name); @@ -93,6 +95,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), -- cgit v1.2.3 From 5a08b04f637921e44ba767c07c74b0535504ab71 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Feb 2017 06:15:13 +0200 Subject: virtio: allow extra context per descriptor Allow extra context per descriptor. To avoid slow down for data path, this disables use of indirect descriptors for this vq. Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7edfbdb55a99..ed04753278d4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -44,6 +44,12 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, @@ -59,6 +65,9 @@ bool virtqueue_notify(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); +void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, + void **ctx); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); -- cgit v1.2.3 From 74da4a0f574d11ed60dbe50a1e5e942e20476590 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 3 Mar 2017 18:16:07 +0100 Subject: libceph, ceph: always advertise all supported features No reason to hide CephFS-specific features in the rbd case. Recent feature bits mix RADOS and CephFS-specific stuff together anyway. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_features.h | 4 ++++ include/linux/ceph/libceph.h | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index ae2f66833762..fd8b2953c78f 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,8 +105,10 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -114,11 +116,13 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 88cd5dc8e238..cecbf5a26e5a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -262,10 +262,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); -extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private, - u64 supported_features, - u64 required_features); +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); -- cgit v1.2.3 From 06dfa96399a9a3280dd81e47f8696aa89f1783e7 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:27 +0200 Subject: libceph: convert ceph_snap_context.nref from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index cecbf5a26e5a..3229ae6c7846 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -161,7 +162,7 @@ struct ceph_client { * dirtied. */ struct ceph_snap_context { - atomic_t nref; + refcount_t nref; u64 seq; u32 num_snaps; u64 snaps[]; -- cgit v1.2.3 From 02113a0f14e20bd8e675d7cec16db6eaaf2b2380 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:28 +0200 Subject: libceph: convert ceph_osd.o_ref from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c125b5d9e13c..d6a625e75040 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); /* a given osd we're communicating with */ struct ceph_osd { - atomic_t o_ref; + refcount_t o_ref; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; -- cgit v1.2.3 From 0e1a5ee6577e43e5be55369d398107080b360941 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:29 +0200 Subject: libceph: convert ceph_pagelist.refcnt from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 13d71fe18b0c..75a7db21457d 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -2,7 +2,7 @@ #define __FS_CEPH_PAGELIST_H #include -#include +#include #include #include @@ -13,7 +13,7 @@ struct ceph_pagelist { size_t room; struct list_head free_list; size_t num_pages_free; - atomic_t refcnt; + refcount_t refcnt; }; struct ceph_pagelist_cursor { @@ -30,7 +30,7 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->room = 0; INIT_LIST_HEAD(&pl->free_list); pl->num_pages_free = 0; - atomic_set(&pl->refcnt, 1); + refcount_set(&pl->refcnt, 1); } extern void ceph_pagelist_release(struct ceph_pagelist *pl); -- cgit v1.2.3 From 76201b6354bb3aa31c7ba2bd42b9cbb8dda71c44 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 28 Mar 2017 17:04:13 +0800 Subject: ceph: allow connecting to mds whose rank >= mdsmap::m_max_mds mdsmap::m_max_mds is the expected count of active mds. It's not the max rank of active mds. User can decrease mdsmap::m_max_mds, but does not stop mds whose rank >= mdsmap::m_max_mds. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- include/linux/ceph/mdsmap.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 8ed5dc505fbb..d5f783f3226a 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* size of m_addr, m_state arrays */ + int m_num_mds; struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -40,7 +41,7 @@ struct ceph_mdsmap { static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return NULL; return &m->m_info[w].addr; } @@ -48,14 +49,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_max_mds) + if (w >= 0 && w < m->m_num_mds) return m->m_info[w].laggy; return false; } -- cgit v1.2.3 From 79162547b76e4979b21ef80c9629ada94a51a59b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 5 Apr 2017 12:54:05 -0400 Subject: ceph: make seeky readdir more efficient Current cephfs client uses string to indicate start position of readdir. The string is last entry of previous readdir reply. This approach does not work for seeky readdir because we can not easily convert the new postion to a string. For seeky readdir, mds needs to return dentries from the beginning. Client keeps retrying if the reply does not contain the dentry it wants. In current version of ceph, mds sorts CDentry in its cache in hash order. Client also uses dentry hash to compose dir postion. For seeky readdir, if client passes the hash part of dir postion to mds. mds can avoid replying useless dentries. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index f4b2ee18f38c..1787e4a8e251 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -365,6 +365,7 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) #define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) union ceph_mds_request_args { struct { @@ -384,6 +385,7 @@ union ceph_mds_request_args { __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; __le16 flags; + __le32 offset_hash; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3 From aa26d662b9d44e7f5b0d109e892e537a23471863 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:36 -0400 Subject: libceph: remove req->r_replay_version Nothing uses this anymore with the removal of the ack vs. commit code. Remove the field and just encode zeroes into place in the request encoding. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d6a625e75040..3fc9e7754a9b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -192,7 +192,6 @@ struct ceph_osd_request { unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ int r_attempts; - struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; u32 r_map_dne_bound; -- cgit v1.2.3 From a1f4020aab10a6bddb2d061c960ebe52cdfa30b5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:37 -0400 Subject: libceph: allow requests to return immediately on full conditions if caller wishes Usually, when the osd map is flagged as full or the pool is at quota, write requests just hang. This is not what we want for cephfs, where it would be better to simply report -ENOSPC back to userland instead of stalling. If the caller knows that it will want an immediate error return instead of blocking on a full or at-quota error condition then allow it to set a flag to request that behavior. Set that flag in ceph_osdc_new_request (since ceph.ko is the only caller), and on any other write request from ceph.ko. A later patch will deal with requests that were submitted before the new map showing the full condition came in. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3fc9e7754a9b..8cf644197b1a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -187,6 +187,7 @@ struct ceph_osd_request { struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ + bool r_abort_on_full; /* return ENOSPC when full */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ -- cgit v1.2.3 From 58eb7932ae4d671d2a2377a1779eda96a2789b11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Apr 2017 09:21:16 -0400 Subject: libceph: add an epoch_barrier field to struct ceph_osd_client Cephfs can get cap update requests that contain a new epoch barrier in them. When that happens we want to pause all OSD traffic until the right map epoch arrives. Add an epoch_barrier field to ceph_osd_client that is protected by the osdc->lock rwsem. When the barrier is set, and the current OSD map epoch is below that, pause the request target when submitting the request or when revisiting it. Add a way for upper layers (cephfs) to update the epoch_barrier as well. If we get a new map, compare the new epoch against the barrier before kicking requests and request another map if the map epoch is still lower than the one we want. If we get a map with a full pool, or at quota condition, then set the barrier to the current epoch value. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8cf644197b1a..85650b415e73 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -267,6 +267,7 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; + u32 epoch_barrier; struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; @@ -305,6 +306,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); -- cgit v1.2.3 From 14bb211d324d6c8140167bd6b2b8a80757348a2f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Apr 2017 12:17:38 +0200 Subject: rbd: support updating the lock cookie without releasing the lock As we no longer release the lock before potentially raising BLACKLISTED in rbd_reregister_watch(), the "either locked or blacklisted" assert in rbd_queue_workfn() needs to go: we can be both locked and blacklisted at that point now. Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- include/linux/ceph/cls_lock_client.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h index 84884d8d4710..0594d3bba774 100644 --- a/include/linux/ceph/cls_lock_client.h +++ b/include/linux/ceph/cls_lock_client.h @@ -37,6 +37,11 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, char *lock_name, char *cookie, struct ceph_entity_name *locker); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie); void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers); -- cgit v1.2.3 From f775ff7d89f33fc9ba63f6f70df3bcc98c2d9828 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 27 Apr 2017 18:34:00 +0200 Subject: ceph: fix file open flags on ppc64 The file open flags (O_foo) are platform specific and should never go out to an interface that is not local to the system. Unfortunately these flags have leaked out onto the wire in the cephfs implementation. That lead to bogus flags getting transmitted on ppc64. This patch converts the kernel view of flags to the ceph view of file open flags. Fixes: 124e68e74 ("ceph: file operations") Signed-off-by: Alexander Graf Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 1787e4a8e251..ad078ebe25d6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -367,6 +367,18 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_HASH_ORDER (1<<9) #define CEPH_READDIR_OFFSET_HASH (1<<10) +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ -- cgit v1.2.3 From 44f0aeec203738bf34f4b7e16b745c8c71fe0f06 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:33 +0200 Subject: dmaengine: pl080: Cut some unused defines There is no in-kernel code using these indexed register defines, and their offsets are clearly defined right below. Cut them. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- include/linux/amba/pl080.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 91b84a7f0539..800429f4b997 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -46,16 +46,8 @@ /* Per channel configuration registers */ -#define PL080_Cx_STRIDE (0x20) +/* Per channel configuration registers */ #define PL080_Cx_BASE(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_SRC_ADDR(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_DST_ADDR(x) ((0x104 + (x * 0x20))) -#define PL080_Cx_LLI(x) ((0x108 + (x * 0x20))) -#define PL080_Cx_CONTROL(x) ((0x10C + (x * 0x20))) -#define PL080_Cx_CONFIG(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONTROL2(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONFIG(x) ((0x114 + (x * 0x20))) - #define PL080_CH_SRC_ADDR (0x00) #define PL080_CH_DST_ADDR (0x04) #define PL080_CH_LLI (0x08) -- cgit v1.2.3 From ded091fee6806b7120f475d89c151d611758a395 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:53 +0200 Subject: dmaengine: pl08x: Use the BIT() macro consistently This makes the driver shift bits with BIT() which is used on other places in the driver. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- include/linux/amba/pl080.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 800429f4b997..580b5323a717 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -38,9 +38,9 @@ #define PL080_SOFT_LSREQ (0x2C) #define PL080_CONFIG (0x30) -#define PL080_CONFIG_M2_BE (1 << 2) -#define PL080_CONFIG_M1_BE (1 << 1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_M2_BE BIT(2) +#define PL080_CONFIG_M1_BE BIT(1) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_SYNC (0x34) @@ -58,18 +58,18 @@ #define PL080_LLI_ADDR_MASK (0x3fffffff << 2) #define PL080_LLI_ADDR_SHIFT (2) -#define PL080_LLI_LM_AHB2 (1 << 0) +#define PL080_LLI_LM_AHB2 BIT(0) -#define PL080_CONTROL_TC_IRQ_EN (1 << 31) +#define PL080_CONTROL_TC_IRQ_EN BIT(31) #define PL080_CONTROL_PROT_MASK (0x7 << 28) #define PL080_CONTROL_PROT_SHIFT (28) -#define PL080_CONTROL_PROT_CACHE (1 << 30) -#define PL080_CONTROL_PROT_BUFF (1 << 29) -#define PL080_CONTROL_PROT_SYS (1 << 28) -#define PL080_CONTROL_DST_INCR (1 << 27) -#define PL080_CONTROL_SRC_INCR (1 << 26) -#define PL080_CONTROL_DST_AHB2 (1 << 25) -#define PL080_CONTROL_SRC_AHB2 (1 << 24) +#define PL080_CONTROL_PROT_CACHE BIT(30) +#define PL080_CONTROL_PROT_BUFF BIT(29) +#define PL080_CONTROL_PROT_SYS BIT(28) +#define PL080_CONTROL_DST_INCR BIT(27) +#define PL080_CONTROL_SRC_INCR BIT(26) +#define PL080_CONTROL_DST_AHB2 BIT(25) +#define PL080_CONTROL_SRC_AHB2 BIT(24) #define PL080_CONTROL_DWIDTH_MASK (0x7 << 21) #define PL080_CONTROL_DWIDTH_SHIFT (21) #define PL080_CONTROL_SWIDTH_MASK (0x7 << 18) @@ -95,20 +95,20 @@ #define PL080_WIDTH_16BIT (0x1) #define PL080_WIDTH_32BIT (0x2) -#define PL080N_CONFIG_ITPROT (1 << 20) -#define PL080N_CONFIG_SECPROT (1 << 19) -#define PL080_CONFIG_HALT (1 << 18) -#define PL080_CONFIG_ACTIVE (1 << 17) /* RO */ -#define PL080_CONFIG_LOCK (1 << 16) -#define PL080_CONFIG_TC_IRQ_MASK (1 << 15) -#define PL080_CONFIG_ERR_IRQ_MASK (1 << 14) +#define PL080N_CONFIG_ITPROT BIT(20) +#define PL080N_CONFIG_SECPROT BIT(19) +#define PL080_CONFIG_HALT BIT(18) +#define PL080_CONFIG_ACTIVE BIT(17) /* RO */ +#define PL080_CONFIG_LOCK BIT(16) +#define PL080_CONFIG_TC_IRQ_MASK BIT(15) +#define PL080_CONFIG_ERR_IRQ_MASK BIT(14) #define PL080_CONFIG_FLOW_CONTROL_MASK (0x7 << 11) #define PL080_CONFIG_FLOW_CONTROL_SHIFT (11) #define PL080_CONFIG_DST_SEL_MASK (0xf << 6) #define PL080_CONFIG_DST_SEL_SHIFT (6) #define PL080_CONFIG_SRC_SEL_MASK (0xf << 1) #define PL080_CONFIG_SRC_SEL_SHIFT (1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_FLOW_MEM2MEM (0x0) #define PL080_FLOW_MEM2PER (0x1) -- cgit v1.2.3 From 2be83da85a64773efaa407639de81bd1377f880e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 4 May 2017 12:34:32 +0100 Subject: thermal: devfreq_cooling: add new interface for direct power read This patch introduces a new interface for device drivers connected to devfreq_cooling in the thermal framework: get_real_power(). Some devices have more sophisticated methods (like power counters) to approximate the actual power that they use. In the previous implementation we had a pre-calculated power table which was then scaled by 'utilization' ('busy_time' and 'total_time' taken from devfreq 'last_status'). With this new interface the driver can provide more precise data regarding actual power to the thermal governor every time the power budget is calculated. We then use this value and calculate the real resource utilization scaling factor. Reviewed-by: Chris Diamand Acked-by: Javi Merino Signed-off-by: Lukasz Luba --- include/linux/devfreq_cooling.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index c35d0c0e0ada..4635f95000a4 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -34,6 +34,23 @@ * If get_dynamic_power() is NULL, then the * dynamic power is calculated as * @dyn_power_coeff * frequency * voltage^2 + * @get_real_power: When this is set, the framework uses it to ask the + * device driver for the actual power. + * Some devices have more sophisticated methods + * (like power counters) to approximate the actual power + * that they use. + * This function provides more accurate data to the + * thermal governor. When the driver does not provide + * such function, framework just uses pre-calculated + * table and scale the power by 'utilization' + * (based on 'busy_time' and 'total_time' taken from + * devfreq 'last_status'). + * The value returned by this function must be lower + * or equal than the maximum power value + * for the current state + * (which can be found in power_table[state]). + * When this interface is used, the power_table holds + * max total (static + dynamic) power value for each OPP. */ struct devfreq_cooling_power { unsigned long (*get_static_power)(struct devfreq *devfreq, @@ -41,6 +58,8 @@ struct devfreq_cooling_power { unsigned long (*get_dynamic_power)(struct devfreq *devfreq, unsigned long freq, unsigned long voltage); + int (*get_real_power)(struct devfreq *df, u32 *power, + unsigned long freq, unsigned long voltage); unsigned long dyn_power_coeff; }; -- cgit v1.2.3 From 8a537ece3d946227e4afa81eae0e43fa47439c7d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:22:09 +0200 Subject: PM / wakeup: Integrate mechanism to abort transitions in progress The system wakeup framework is not very consistent with respect to the way it handles suspend-to-idle and generally wakeup events occurring during transitions to system low-power states. First off, system transitions in progress are aborted by the event reporting helpers like pm_wakeup_event() only if the wakeup_count sysfs attribute is in use (as documented), but there are cases in which system-wide transitions should be aborted even if that is not the case. For example, a wakeup signal from a designated wakeup device during system-wide PM transition, it should cause the transition to be aborted right away. Moreover, there is a freeze_wake() call in wakeup_source_activate(), but that really is only effective after suspend_freeze_state has been set to FREEZE_STATE_ENTER by freeze_enter(). However, it is very unlikely that wakeup_source_activate() will ever be called at that time, as it could only be triggered by a IRQF_NO_SUSPEND interrupt handler, so wakeups from suspend-to-idle don't really occur in wakeup_source_activate(). At the same time there is a way to abort a system suspend in progress (or wake up the system from suspend-to-idle), which is by calling pm_system_wakeup(), but in turn that doesn't cause any wakeup source objects to be activated, so it will not be covered by wakeup source statistics and will not prevent the system from suspending again immediately (in case autosleep is used, for example). Consequently, if anyone wants to abort system transitions in progress and allow the wakeup_count mechanism to work, they need to use both pm_system_wakeup() and pm_wakeup_event(), say, at the same time which is awkward. For the above reasons, make it possible to trigger pm_system_wakeup() from within wakeup_source_activate() and provide a new pm_wakeup_hard_event() helper to do so within the wakeup framework. Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index a3447932df1f..4c2cba7ec1d4 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -106,8 +106,8 @@ extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); -extern void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec); -extern void pm_wakeup_event(struct device *dev, unsigned int msec); +extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); +extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ @@ -182,9 +182,11 @@ static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} -static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) {} +static inline void pm_wakeup_ws_event(struct wakeup_source *ws, + unsigned int msec, bool hard) {} -static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} +static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, + bool hard) {} #endif /* !CONFIG_PM_SLEEP */ @@ -201,4 +203,19 @@ static inline void wakeup_source_trash(struct wakeup_source *ws) wakeup_source_drop(ws); } +static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +{ + return pm_wakeup_ws_event(ws, msec, false); +} + +static inline void pm_wakeup_event(struct device *dev, unsigned int msec) +{ + return pm_wakeup_dev_event(dev, msec, false); +} + +static inline void pm_wakeup_hard_event(struct device *dev) +{ + return pm_wakeup_dev_event(dev, 0, true); +} + #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3 From eed4d47efe9508b855b09754cf6de4325d8a2f0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:23:03 +0200 Subject: ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For this reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops. In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled (that also helps to catch device-induced wakeup events occurring during suspend transitions in progress). Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0b1cf32edfd7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} -- cgit v1.2.3 From 71afe470e20db133b30730cfa856e5d6854312e9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 13 Apr 2017 09:06:20 +0200 Subject: KVM: arm64: vgic-its: Introduce migration ABI infrastructure We plan to support different migration ABIs, ie. characterizing the ITS table layout format in guest RAM. For example, a new ABI will be needed if vLPIs get supported for nested use case. So let's introduce an array of supported ABIs (at the moment a single ABI is supported though). The following characteristics are foreseen to vary with the ABI: size of table entries, save/restore operation, the way abi settings are applied. By default the MAX_ABI_REV is applied on its creation. In subsequent patches we will introduce a way for the userspace to change the ABI in use. The entry sizes now are set according to the ABI version and not hardcoded anymore. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 97cbca19430d..81ebe437ccc3 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -132,6 +132,9 @@ #define GIC_BASER_SHAREABILITY(reg, type) \ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + #define GICR_PROPBASER_SHAREABILITY_SHIFT (10) #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) @@ -232,6 +235,7 @@ #define GITS_CTLR_QUIESCENT (1U << 31) #define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) @@ -290,6 +294,7 @@ #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) -- cgit v1.2.3 From ab01c6bdacc43c41c6b326889f4358f5afc38bf9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 23 Mar 2017 15:14:00 +0100 Subject: KVM: arm64: vgic-its: Implement vgic_mmio_uaccess_write_its_iidr The GITS_IIDR revision field is used to encode the migration ABI revision. So we need to restore it to check the table layout is readable by the destination. By writing the IIDR, userspace thus forces the ABI revision to be used and this must be less than or equal to the max revision KVM supports. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 81ebe437ccc3..2eaea308f003 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -242,6 +242,11 @@ #define GITS_TYPER_PTA (1UL << 19) #define GITS_TYPER_HWCOLLCNT_SHIFT 24 +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + #define GITS_CBASER_VALID (1ULL << 63) #define GITS_CBASER_SHAREABILITY_SHIFT (10) #define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) -- cgit v1.2.3 From 0d44cdb631ef53ea75be056886cf0541311e48df Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 22 Dec 2016 18:14:14 +0100 Subject: KVM: arm64: vgic-its: Interpret MAPD Size field and check related errors Up to now the MAPD's ITT size field has been ignored. It encodes the number of eventid bit minus 1. It should be used to check the eventid when a MAPTI command is issued on a device. Let's store the number of eventid bits in the its_device and do the check on MAPTI. Also make sure the ITT size field does not exceed the GITS_TYPER IDBITS field. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 2eaea308f003..be8bad00c419 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -347,9 +347,11 @@ #define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 #define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 #define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 #define E_ITS_MAPC_PROCNUM_OOR 0x010902 #define E_ITS_MAPC_COLLECTION_OOR 0x010903 #define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 #define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 #define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 #define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 -- cgit v1.2.3 From 44de9d683847ba6dbac290bb8c9f1b773cbda746 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 4 May 2017 11:19:52 +0200 Subject: KVM: arm64: vgic-v3: vgic_v3_lpi_sync_pending_status this new helper synchronizes the irq pending_latch with the LPI pending bit status found in rdist pending table. As the status is consumed, we reset the bit in pending table. As we need the PENDBASER_ADDRESS() in vgic-v3, let's move its definition in the irqchip header. We restore the full length of the field, ie [51:16]. Same for PROPBASER_ADDRESS with full field length of [51:12]. Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index be8bad00c419..fffb91202bc9 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -159,6 +159,8 @@ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) #define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) -- cgit v1.2.3 From ef51042472f55b325fd7f2b26a2e29fd89757234 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 May 2017 10:55:27 -0700 Subject: block, dax: move "select DAX" from BLOCK to FS_DAX For configurations that do not enable DAX filesystems or drivers, do not require the DAX core to be built. Given that the 'direct_access' method has been removed from 'block_device_operations', we can also go ahead and remove the block-related dax helper functions from fs/block_dev.c to drivers/dax/super.c. This keeps dax details out of the block layer and lets the DAX core be built as a module in the FS_DAX=n case. Filesystems need to include dax.h to call bdev_dax_supported(). Cc: linux-xfs@vger.kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Matthew Wilcox Cc: Alexander Viro Cc: "Darrick J. Wong" Cc: Ross Zwisler Reviewed-by: Jan Kara Reported-by: Geert Uytterhoeven Signed-off-by: Dan Williams --- include/linux/blkdev.h | 2 -- include/linux/dax.h | 30 ++++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 848f87eb1905..e4d9899755a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1940,8 +1940,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern int bdev_dax_supported(struct super_block *, int); -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..7fdf1d710042 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,12 +18,38 @@ struct dax_operations { void **, pfn_t *); }; +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); +#if IS_ENABLED(CONFIG_FS_DAX) +int __bdev_dax_supported(struct super_block *sb, int blocksize); +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return __bdev_dax_supported(sb, blocksize); +} +#else +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + int dax_read_lock(void); void dax_read_unlock(int id); -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops); -void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); -- cgit v1.2.3 From e092693443b995c8e3a565a73b5fdb05f1260f9b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 8 May 2017 18:02:24 -0400 Subject: NFS append COMMIT after synchronous COPY Instead of messing with the commit path which has been causing issues, add a COMMIT op after the COPY and ask for stable copies in the first space. It saves a round trip, since after the COPY, the client sends a COMMIT anyway. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 677c6b91dfcd..b28c83475ee8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1383,6 +1383,7 @@ struct nfs42_copy_res { struct nfs42_write_res write_res; bool consecutive; bool synchronous; + struct nfs_commitres commit_res; }; struct nfs42_seek_args { -- cgit v1.2.3 From 497d72d80a789501501cccabdad6b145f9e31371 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 8 May 2017 20:38:40 +0200 Subject: KVM: Add kvm_vcpu_get_idx to get vcpu index in kvm->vcpus There are occasional needs to use the index of vcpu in the kvm->vcpus array to map something related to a VCPU. For example, unlike the vcpu->vcpu_id, the vcpu index is guaranteed to not be sparse across all vcpus which is useful when allocating a memory area for each vcpu. Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..12eb26d665e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -490,6 +490,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + int idx; + + kvm_for_each_vcpu(idx, tmp, vcpu->kvm) + if (tmp == vcpu) + return idx; + BUG(); +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ -- cgit v1.2.3 From ea47dd191d543f81e0912b5dc0471b48346b016e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 19 Apr 2017 05:12:18 +1000 Subject: of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle Introduce primitives for FDT parsing. These will be used for powerpc cpufeatures node scanning, which has quite complex structure but should be processed early. Cc: devicetree@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- include/linux/of_fdt.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 271b3fdf0070..1dfbfd0d8040 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -54,6 +54,11 @@ extern char __dtb_end[]; extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, int depth, void *data), void *data); +extern int of_scan_flat_dt_subnodes(unsigned long node, + int (*it)(unsigned long node, + const char *uname, + void *data), + void *data); extern int of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname); extern const void *of_get_flat_dt_prop(unsigned long node, const char *name, @@ -62,6 +67,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern int of_flat_dt_match(unsigned long node, const char *const *matches); extern unsigned long of_get_flat_dt_root(void); extern int of_get_flat_dt_size(void); +extern uint32_t of_get_flat_dt_phandle(unsigned long node); extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data); -- cgit v1.2.3 From 9ea762a5ae45235eb3e5ec9c05c33cf37db78d70 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 30 Mar 2017 13:13:33 +0200 Subject: virtio: virtio_driver doc Add comments for the virtio_driver members that were not documented. Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ed04753278d4..28b0e965360f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -165,9 +165,13 @@ int virtio_device_restore(struct virtio_device *dev); * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. * @probe: the function to call when a device is found. Returns 0 or -errno. + * @scan: optional function to call after successful probe; intended + * for virtio-scsi to invoke a scan. * @remove: the function to call when a device is removed. * @config_changed: optional function to call when the device configuration * changes; may be called in interrupt context. + * @freeze: optional function to call during suspend/hibernation. + * @restore: optional function to call on resume. */ struct virtio_driver { struct device_driver driver; -- cgit v1.2.3 From fb9de9704775d6190c204f4ddf8da4cfdac26be1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 7 Apr 2017 08:25:09 +0300 Subject: ptr_ring: batch ring zeroing A known weakness in ptr_ring design is that it does not handle well the situation when ring is almost full: as entries are consumed they are immediately used again by the producer, so consumer and producer are writing to a shared cache line. To fix this, add batching to consume calls: as entries are consumed do not write NULL into the ring until we get a multiple (in current implementation 2x) of cache lines away from the producer. At that point, write them all out. We do the write out in the reverse order to keep producer from sharing cache with consumer for as long as possible. Writeout also triggers when ring wraps around - there's no special reason to do this but it helps keep the code a bit simpler. What should we do if getting away from producer by 2 cache lines would mean we are keeping the ring moe than half empty? Maybe we should reduce the batching in this case, current patch simply reduces the batching. Notes: - it is no longer true that a call to consume guarantees that the following call to produce will succeed. No users seem to assume that. - batching can also in theory reduce the signalling rate: users that would previously send interrups to the producer to wake it up after consuming each entry would now only need to do this once in a batch. Doing this would be easy by returning a flag to the caller. No users seem to do signalling on consume yet so this was not implemented yet. Signed-off-by: Michael S. Tsirkin Reviewed-by: Jesper Dangaard Brouer Acked-by: Jason Wang --- include/linux/ptr_ring.h | 63 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6c70444da3b9..6b2e0dd88569 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -34,11 +34,13 @@ struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; - int consumer ____cacheline_aligned_in_smp; + int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ + int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ + int batch; /* number of entries to consume in a batch */ void **queue; }; @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) - return r->queue[r->consumer]; + return r->queue[r->consumer_head]; return NULL; } @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { - r->queue[r->consumer++] = NULL; - if (unlikely(r->consumer >= r->size)) - r->consumer = 0; + /* Fundamentally, what we want to do is update consumer + * index and zero out the entry so producer can reuse it. + * Doing it naively at each consume would be as simple as: + * r->queue[r->consumer++] = NULL; + * if (unlikely(r->consumer >= r->size)) + * r->consumer = 0; + * but that is suboptimal when the ring is full as producer is writing + * out new entries in the same cache line. Defer these updates until a + * batch of entries has been consumed. + */ + int head = r->consumer_head++; + + /* Once we have processed enough entries invalidate them in + * the ring all at once so producer can reuse their space in the ring. + * We also do this when we reach end of the ring - not mandatory + * but helps keep the implementation simple. + */ + if (unlikely(r->consumer_head - r->consumer_tail >= r->batch || + r->consumer_head >= r->size)) { + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + r->consumer_tail = r->consumer_head; + } + if (unlikely(r->consumer_head >= r->size)) { + r->consumer_head = 0; + r->consumer_tail = 0; + } } static inline void *__ptr_ring_consume(struct ptr_ring *r) @@ -345,14 +376,27 @@ static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); } +static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) +{ + r->size = size; + r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); + /* We need to set batch at least to 1 to make logic + * in __ptr_ring_discard_one work correctly. + * Batching too much (because ring is small) would cause a lot of + * burstiness. Needs tuning, for now disable batching. + */ + if (r->batch > r->size / 2 || !r->batch) + r->batch = 1; +} + static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc(size, gfp); if (!r->queue) return -ENOMEM; - r->size = size; - r->producer = r->consumer = 0; + __ptr_ring_set_size(r, size); + r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); @@ -373,9 +417,10 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); - r->size = size; + __ptr_ring_set_size(r, size); r->producer = producer; - r->consumer = 0; + r->consumer_head = 0; + r->consumer_tail = 0; old = r->queue; r->queue = queue; -- cgit v1.2.3 From 3ae3d67ba705c754a3c91ac009f9ce73a0e7286a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 10 May 2017 15:01:30 -0600 Subject: libnvdimm: add an atomic vs process context flag to rw_bytes nsio_rw_bytes can clear media errors, but this cannot be done while we are in an atomic context due to locking within ACPI. From the BTT, ->rw_bytes may be called either from atomic or process context depending on whether the calls happen during initialization or during IO. During init, we want to ensure error clearing happens, and the flag marking process context allows nsio_rw_bytes to do that. When called during IO, we're in atomic context, and error clearing can be skipped. Cc: Dan Williams Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/linux/nd.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nd.h b/include/linux/nd.h index fa66aeed441a..194b8e002ea7 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -48,7 +48,7 @@ struct nd_namespace_common { struct device dev; struct device *claim; int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, - void *buf, size_t size, int rw); + void *buf, size_t size, int rw, unsigned long flags); }; static inline struct nd_namespace_common *to_ndns(struct device *dev) @@ -134,9 +134,10 @@ static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device * * @buf is up-to-date upon return from this routine. */ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, READ); + return ndns->rw_bytes(ndns, offset, buf, size, READ, flags); } /** @@ -152,9 +153,10 @@ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, * to media is handled internal to the @ndns driver, if at all. */ static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, WRITE); + return ndns->rw_bytes(ndns, offset, buf, size, WRITE, flags); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3 From 572e0ca9b909339fbe017aaff1a225efb6db3b61 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Fri, 12 May 2017 15:46:29 -0700 Subject: time: delete current_fs_time() All uses of the current_fs_time() function have been replaced by other time interfaces. And, its use cases can be fulfilled by current_time() or ktime_get_* variants. Link: http://lkml.kernel.org/r/1491613030-11599-13-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ad325ed71e8..803e5a9b2654 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1431,7 +1431,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -extern struct timespec current_fs_time(struct super_block *sb); extern struct timespec current_time(struct inode *inode); /* -- cgit v1.2.3 From 8594a21cf7a8318baedbedc3fcd2967a17ddeec0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:41 -0700 Subject: mm, vmalloc: fix vmalloc users tracking properly Commit 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") has pulled asm/pgtable.h include dependency to linux/vmalloc.h and that turned out to be a bad idea for some architectures. E.g. m68k fails with In file included from arch/m68k/include/asm/pgtable_mm.h:145:0, from arch/m68k/include/asm/pgtable.h:4, from include/linux/vmalloc.h:9, from arch/m68k/kernel/module.c:9: arch/m68k/include/asm/mcf_pgtable.h: In function 'nocache_page': >> arch/m68k/include/asm/mcf_pgtable.h:339:43: error: 'init_mm' undeclared (first use in this function) #define pgd_offset_k(address) pgd_offset(&init_mm, address) as spotted by kernel build bot. nios2 fails for other reason In file included from include/asm-generic/io.h:767:0, from arch/nios2/include/asm/io.h:61, from include/linux/io.h:25, from arch/nios2/include/asm/pgtable.h:18, from include/linux/mm.h:70, from include/linux/pid_namespace.h:6, from include/linux/ptrace.h:9, from arch/nios2/include/uapi/asm/elf.h:23, from arch/nios2/include/asm/elf.h:22, from include/linux/elf.h:4, from include/linux/module.h:15, from init/main.c:16: include/linux/vmalloc.h: In function '__vmalloc_node_flags': include/linux/vmalloc.h:99:40: error: 'PAGE_KERNEL' undeclared (first use in this function); did you mean 'GFP_KERNEL'? which is due to the newly added #include , which on nios2 includes and thus and which again includes . Tweaking that around just turns out a bigger headache than necessary. This patch reverts 1f5307b1e094 and reimplements the original fix in a different way. __vmalloc_node_flags can stay static inline which will cover vmalloc* functions. We only have one external user (kvmalloc_node) and we can export __vmalloc_node_flags_caller and provide the caller directly. This is much simpler and it doesn't really need any games with header files. [akpm@linux-foundation.org: coding-style fixes] [mhocko@kernel.org: revert old comment] Link: http://lkml.kernel.org/r/20170509211054.GB16325@dhcp22.suse.cz Fixes: 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") Link: http://lkml.kernel.org/r/20170509153702.GR6481@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Tobias Klauser Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0328ce003992..2d92dd002abd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -6,7 +6,6 @@ #include #include #include /* pgprot_t */ -#include /* PAGE_KERNEL */ #include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -83,22 +82,14 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -#else -extern void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); - -/* - * We really want to have this inlined due to caller tracking. This - * function is used by the highlevel vmalloc apis and so we want to track - * their callers and inlining will achieve that. - */ -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) { - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node_flags(size, node, flags); } +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); #endif extern void vfree(const void *addr); -- cgit v1.2.3 From 4636e70bb0a8b871998b6841a2e4b205cf2bc863 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 12 May 2017 15:46:47 -0700 Subject: dax: prevent invalidation of mapped DAX entries Patch series "mm,dax: Fix data corruption due to mmap inconsistency", v4. This series fixes data corruption that can happen for DAX mounts when page faults race with write(2) and as a result page tables get out of sync with block mappings in the filesystem and thus data seen through mmap is different from data seen through read(2). The series passes testing with t_mmap_stale test program from Ross and also other mmap related tests on DAX filesystem. This patch (of 4): dax_invalidate_mapping_entry() currently removes DAX exceptional entries only if they are clean and unlocked. This is done via: invalidate_mapping_pages() invalidate_exceptional_entry() dax_invalidate_mapping_entry() However, for page cache pages removed in invalidate_mapping_pages() there is an additional criteria which is that the page must not be mapped. This is noted in the comments above invalidate_mapping_pages() and is checked in invalidate_inode_page(). For DAX entries this means that we can can end up in a situation where a DAX exceptional entry, either a huge zero page or a regular DAX entry, could end up mapped but without an associated radix tree entry. This is inconsistent with the rest of the DAX code and with what happens in the page cache case. We aren't able to unmap the DAX exceptional entry because according to its comments invalidate_mapping_pages() isn't allowed to block, and unmap_mapping_range() takes a write lock on the mapping->i_mmap_rwsem. Since we essentially never have unmapped DAX entries to evict from the radix tree, just remove dax_invalidate_mapping_entry(). Fixes: c6dcf52c23d2 ("mm: Invalidate DAX radix tree entries only if appropriate") Link: http://lkml.kernel.org/r/20170510085419.27601-2-jack@suse.cz Signed-off-by: Ross Zwisler Signed-off-by: Jan Kara Reported-by: Jan Kara Cc: Dan Williams Cc: [4.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dax.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..d1236d16ef00 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,7 +63,6 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, -- cgit v1.2.3