From a1867f85e06edacd82956d3422caa2b9074f4321 Mon Sep 17 00:00:00 2001 From: Min Li Date: Fri, 18 Jun 2021 12:37:12 -0400 Subject: mfd: Add Renesas Synchronization Management Unit (SMU) support Add support for ClockMatrix(TM) and 82P33xxx families of timing and synchronization devices. The access interface can be either SPI or I2C. Currently, it will create 2 types of MFD devices, which are to be used by the corresponding rsmu character device driver and the PTP hardware clock driver, respectively. Signed-off-by: Min Li Signed-off-by: Lee Jones --- include/linux/mfd/idt82p33_reg.h | 112 ++++++ include/linux/mfd/idt8a340_reg.h | 729 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/rsmu.h | 36 ++ 3 files changed, 877 insertions(+) create mode 100644 include/linux/mfd/idt82p33_reg.h create mode 100644 include/linux/mfd/idt8a340_reg.h create mode 100644 include/linux/mfd/rsmu.h (limited to 'include/linux') diff --git a/include/linux/mfd/idt82p33_reg.h b/include/linux/mfd/idt82p33_reg.h new file mode 100644 index 000000000000..129a6c078221 --- /dev/null +++ b/include/linux/mfd/idt82p33_reg.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Register Map - Based on AN888_SMUforIEEE_SynchEther_82P33xxx_RevH.pdf + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ +#ifndef HAVE_IDT82P33_REG +#define HAVE_IDT82P33_REG + +/* Register address */ +#define DPLL1_TOD_CNFG 0x134 +#define DPLL2_TOD_CNFG 0x1B4 + +#define DPLL1_TOD_STS 0x10B +#define DPLL2_TOD_STS 0x18B + +#define DPLL1_TOD_TRIGGER 0x115 +#define DPLL2_TOD_TRIGGER 0x195 + +#define DPLL1_OPERATING_MODE_CNFG 0x120 +#define DPLL2_OPERATING_MODE_CNFG 0x1A0 + +#define DPLL1_HOLDOVER_FREQ_CNFG 0x12C +#define DPLL2_HOLDOVER_FREQ_CNFG 0x1AC + +#define DPLL1_PHASE_OFFSET_CNFG 0x143 +#define DPLL2_PHASE_OFFSET_CNFG 0x1C3 + +#define DPLL1_SYNC_EDGE_CNFG 0x140 +#define DPLL2_SYNC_EDGE_CNFG 0x1C0 + +#define DPLL1_INPUT_MODE_CNFG 0x116 +#define DPLL2_INPUT_MODE_CNFG 0x196 + +#define DPLL1_OPERATING_STS 0x102 +#define DPLL2_OPERATING_STS 0x182 + +#define DPLL1_CURRENT_FREQ_STS 0x103 +#define DPLL2_CURRENT_FREQ_STS 0x183 + +#define REG_SOFT_RESET 0X381 + +#define OUT_MUX_CNFG(outn) REG_ADDR(0x6, (0xC * (outn))) + +/* Register bit definitions */ +#define SYNC_TOD BIT(1) +#define PH_OFFSET_EN BIT(7) +#define SQUELCH_ENABLE BIT(5) + +/* Bit definitions for the DPLL_MODE register */ +#define PLL_MODE_SHIFT (0) +#define PLL_MODE_MASK (0x1F) +#define COMBO_MODE_EN BIT(5) +#define COMBO_MODE_SHIFT (6) +#define COMBO_MODE_MASK (0x3) + +/* Bit definitions for DPLL_OPERATING_STS register */ +#define OPERATING_STS_MASK (0x7) +#define OPERATING_STS_SHIFT (0x0) + +/* Bit definitions for DPLL_TOD_TRIGGER register */ +#define READ_TRIGGER_MASK (0xF) +#define READ_TRIGGER_SHIFT (0x0) +#define WRITE_TRIGGER_MASK (0xF0) +#define WRITE_TRIGGER_SHIFT (0x4) + +/* Bit definitions for REG_SOFT_RESET register */ +#define SOFT_RESET_EN BIT(7) + +enum pll_mode { + PLL_MODE_MIN = 0, + PLL_MODE_AUTOMATIC = PLL_MODE_MIN, + PLL_MODE_FORCE_FREERUN = 1, + PLL_MODE_FORCE_HOLDOVER = 2, + PLL_MODE_FORCE_LOCKED = 4, + PLL_MODE_FORCE_PRE_LOCKED2 = 5, + PLL_MODE_FORCE_PRE_LOCKED = 6, + PLL_MODE_FORCE_LOST_PHASE = 7, + PLL_MODE_DCO = 10, + PLL_MODE_WPH = 18, + PLL_MODE_MAX = PLL_MODE_WPH, +}; + +enum hw_tod_trig_sel { + HW_TOD_TRIG_SEL_MIN = 0, + HW_TOD_TRIG_SEL_NO_WRITE = HW_TOD_TRIG_SEL_MIN, + HW_TOD_TRIG_SEL_NO_READ = HW_TOD_TRIG_SEL_MIN, + HW_TOD_TRIG_SEL_SYNC_SEL = 1, + HW_TOD_TRIG_SEL_IN12 = 2, + HW_TOD_TRIG_SEL_IN13 = 3, + 
HW_TOD_TRIG_SEL_IN14 = 4, + HW_TOD_TRIG_SEL_TOD_PPS = 5, + HW_TOD_TRIG_SEL_TIMER_INTERVAL = 6, + HW_TOD_TRIG_SEL_MSB_PHASE_OFFSET_CNFG = 7, + HW_TOD_TRIG_SEL_MSB_HOLDOVER_FREQ_CNFG = 8, + HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG = 9, + HW_TOD_RD_TRIG_SEL_LSB_TOD_STS = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, + WR_TRIG_SEL_MAX = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, +}; + +/** @brief Enumerated type listing DPLL operational modes */ +enum dpll_state { + DPLL_STATE_FREERUN = 1, + DPLL_STATE_HOLDOVER = 2, + DPLL_STATE_LOCKED = 4, + DPLL_STATE_PRELOCKED2 = 5, + DPLL_STATE_PRELOCKED = 6, + DPLL_STATE_LOSTPHASE = 7, + DPLL_STATE_MAX +}; + +#endif diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h new file mode 100644 index 000000000000..92d763230bdf --- /dev/null +++ b/include/linux/mfd/idt8a340_reg.h @@ -0,0 +1,729 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Based on 5.2.0, Family Programming Guide (Sept 30, 2020) + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ +#ifndef HAVE_IDT8A340_REG +#define HAVE_IDT8A340_REG + +#define PAGE_ADDR_BASE 0x0000 +#define PAGE_ADDR 0x00fc + +#define HW_REVISION 0x8180 +#define REV_ID 0x007a + +#define HW_DPLL_0 (0x8a00) +#define HW_DPLL_1 (0x8b00) +#define HW_DPLL_2 (0x8c00) +#define HW_DPLL_3 (0x8d00) +#define HW_DPLL_4 (0x8e00) +#define HW_DPLL_5 (0x8f00) +#define HW_DPLL_6 (0x9000) +#define HW_DPLL_7 (0x9100) + +#define HW_DPLL_TOD_SW_TRIG_ADDR__0 (0x080) +#define HW_DPLL_TOD_CTRL_1 (0x089) +#define HW_DPLL_TOD_CTRL_2 (0x08A) +#define HW_DPLL_TOD_OVR__0 (0x098) +#define HW_DPLL_TOD_OUT_0__0 (0x0B0) + +#define HW_Q0_Q1_CH_SYNC_CTRL_0 (0xa740) +#define HW_Q0_Q1_CH_SYNC_CTRL_1 (0xa741) +#define HW_Q2_Q3_CH_SYNC_CTRL_0 (0xa742) +#define HW_Q2_Q3_CH_SYNC_CTRL_1 (0xa743) +#define HW_Q4_Q5_CH_SYNC_CTRL_0 (0xa744) +#define HW_Q4_Q5_CH_SYNC_CTRL_1 (0xa745) +#define HW_Q6_Q7_CH_SYNC_CTRL_0 (0xa746) +#define HW_Q6_Q7_CH_SYNC_CTRL_1 (0xa747) +#define HW_Q8_CH_SYNC_CTRL_0 (0xa748) +#define HW_Q8_CH_SYNC_CTRL_1 (0xa749) +#define HW_Q9_CH_SYNC_CTRL_0 (0xa74a) +#define HW_Q9_CH_SYNC_CTRL_1 (0xa74b) +#define HW_Q10_CH_SYNC_CTRL_0 (0xa74c) +#define HW_Q10_CH_SYNC_CTRL_1 (0xa74d) +#define HW_Q11_CH_SYNC_CTRL_0 (0xa74e) +#define HW_Q11_CH_SYNC_CTRL_1 (0xa74f) + +#define SYNC_SOURCE_DPLL0_TOD_PPS 0x14 +#define SYNC_SOURCE_DPLL1_TOD_PPS 0x15 +#define SYNC_SOURCE_DPLL2_TOD_PPS 0x16 +#define SYNC_SOURCE_DPLL3_TOD_PPS 0x17 + +#define SYNCTRL1_MASTER_SYNC_RST BIT(7) +#define SYNCTRL1_MASTER_SYNC_TRIG BIT(5) +#define SYNCTRL1_TOD_SYNC_TRIG BIT(4) +#define SYNCTRL1_FBDIV_FRAME_SYNC_TRIG BIT(3) +#define SYNCTRL1_FBDIV_SYNC_TRIG BIT(2) +#define SYNCTRL1_Q1_DIV_SYNC_TRIG BIT(1) +#define SYNCTRL1_Q0_DIV_SYNC_TRIG BIT(0) + +#define HW_Q8_CTRL_SPARE (0xa7d4) +#define HW_Q11_CTRL_SPARE (0xa7ec) + +/** + * Select FOD5 as sync_trigger for Q8 divider. + * Transition from logic zero to one + * sets trigger to sync Q8 divider. + * + * Unused when FOD4 is driving Q8 divider (normal operation). + */ +#define Q9_TO_Q8_SYNC_TRIG BIT(1) + +/** + * Enable FOD5 as driver for clock and sync for Q8 divider. + * Enable fanout buffer for FOD5. + * + * Unused when FOD4 is driving Q8 divider (normal operation). + */ +#define Q9_TO_Q8_FANOUT_AND_CLOCK_SYNC_ENABLE_MASK (BIT(0) | BIT(2)) + +/** + * Select FOD6 as sync_trigger for Q11 divider. + * Transition from logic zero to one + * sets trigger to sync Q11 divider. + * + * Unused when FOD7 is driving Q11 divider (normal operation). 
+ */ +#define Q10_TO_Q11_SYNC_TRIG BIT(1) + +/** + * Enable FOD6 as driver for clock and sync for Q11 divider. + * Enable fanout buffer for FOD6. + * + * Unused when FOD7 is driving Q11 divider (normal operation). + */ +#define Q10_TO_Q11_FANOUT_AND_CLOCK_SYNC_ENABLE_MASK (BIT(0) | BIT(2)) + +#define RESET_CTRL 0xc000 +#define SM_RESET 0x0012 +#define SM_RESET_V520 0x0013 +#define SM_RESET_CMD 0x5A + +#define GENERAL_STATUS 0xc014 +#define BOOT_STATUS 0x0000 +#define HW_REV_ID 0x000A +#define BOND_ID 0x000B +#define HW_CSR_ID 0x000C +#define HW_IRQ_ID 0x000E +#define MAJ_REL 0x0010 +#define MIN_REL 0x0011 +#define HOTFIX_REL 0x0012 +#define PIPELINE_ID 0x0014 +#define BUILD_ID 0x0018 +#define JTAG_DEVICE_ID 0x001c +#define PRODUCT_ID 0x001e +#define OTP_SCSR_CONFIG_SELECT 0x0022 + +#define STATUS 0xc03c +#define DPLL0_STATUS 0x0018 +#define DPLL1_STATUS 0x0019 +#define DPLL2_STATUS 0x001a +#define DPLL3_STATUS 0x001b +#define DPLL4_STATUS 0x001c +#define DPLL5_STATUS 0x001d +#define DPLL6_STATUS 0x001e +#define DPLL7_STATUS 0x001f +#define DPLL_SYS_STATUS 0x0020 +#define DPLL_SYS_APLL_STATUS 0x0021 +#define DPLL0_FILTER_STATUS 0x0044 +#define DPLL1_FILTER_STATUS 0x004c +#define DPLL2_FILTER_STATUS 0x0054 +#define DPLL3_FILTER_STATUS 0x005c +#define DPLL4_FILTER_STATUS 0x0064 +#define DPLL5_FILTER_STATUS 0x006c +#define DPLL6_FILTER_STATUS 0x0074 +#define DPLL7_FILTER_STATUS 0x007c +#define DPLLSYS_FILTER_STATUS 0x0084 +#define USER_GPIO0_TO_7_STATUS 0x008a +#define USER_GPIO8_TO_15_STATUS 0x008b + +#define GPIO_USER_CONTROL 0xc160 +#define GPIO0_TO_7_OUT 0x0000 +#define GPIO8_TO_15_OUT 0x0001 +#define GPIO0_TO_7_OUT_V520 0x0002 +#define GPIO8_TO_15_OUT_V520 0x0003 + +#define STICKY_STATUS_CLEAR 0xc164 + +#define GPIO_TOD_NOTIFICATION_CLEAR 0xc16c + +#define ALERT_CFG 0xc188 + +#define SYS_DPLL_XO 0xc194 + +#define SYS_APLL 0xc19c + +#define INPUT_0 0xc1b0 +#define INPUT_1 0xc1c0 +#define INPUT_2 0xc1d0 +#define INPUT_3 0xc200 +#define INPUT_4 0xc210 +#define INPUT_5 0xc220 +#define INPUT_6 0xc230 +#define INPUT_7 0xc240 +#define INPUT_8 0xc250 +#define INPUT_9 0xc260 +#define INPUT_10 0xc280 +#define INPUT_11 0xc290 +#define INPUT_12 0xc2a0 +#define INPUT_13 0xc2b0 +#define INPUT_14 0xc2c0 +#define INPUT_15 0xc2d0 + +#define REF_MON_0 0xc2e0 +#define REF_MON_1 0xc2ec +#define REF_MON_2 0xc300 +#define REF_MON_3 0xc30c +#define REF_MON_4 0xc318 +#define REF_MON_5 0xc324 +#define REF_MON_6 0xc330 +#define REF_MON_7 0xc33c +#define REF_MON_8 0xc348 +#define REF_MON_9 0xc354 +#define REF_MON_10 0xc360 +#define REF_MON_11 0xc36c +#define REF_MON_12 0xc380 +#define REF_MON_13 0xc38c +#define REF_MON_14 0xc398 +#define REF_MON_15 0xc3a4 + +#define DPLL_0 0xc3b0 +#define DPLL_CTRL_REG_0 0x0002 +#define DPLL_CTRL_REG_1 0x0003 +#define DPLL_CTRL_REG_2 0x0004 +#define DPLL_TOD_SYNC_CFG 0x0031 +#define DPLL_COMBO_SLAVE_CFG_0 0x0032 +#define DPLL_COMBO_SLAVE_CFG_1 0x0033 +#define DPLL_SLAVE_REF_CFG 0x0034 +#define DPLL_REF_MODE 0x0035 +#define DPLL_PHASE_MEASUREMENT_CFG 0x0036 +#define DPLL_MODE 0x0037 +#define DPLL_MODE_V520 0x003B +#define DPLL_1 0xc400 +#define DPLL_2 0xc438 +#define DPLL_2_V520 0xc43c +#define DPLL_3 0xc480 +#define DPLL_4 0xc4b8 +#define DPLL_4_V520 0xc4bc +#define DPLL_5 0xc500 +#define DPLL_6 0xc538 +#define DPLL_6_V520 0xc53c +#define DPLL_7 0xc580 +#define SYS_DPLL 0xc5b8 +#define SYS_DPLL_V520 0xc5bc + +#define DPLL_CTRL_0 0xc600 +#define DPLL_CTRL_DPLL_MANU_REF_CFG 0x0001 +#define DPLL_CTRL_DPLL_FOD_FREQ 0x001c +#define DPLL_CTRL_COMBO_MASTER_CFG 0x003a +#define DPLL_CTRL_1 
0xc63c +#define DPLL_CTRL_2 0xc680 +#define DPLL_CTRL_3 0xc6bc +#define DPLL_CTRL_4 0xc700 +#define DPLL_CTRL_5 0xc73c +#define DPLL_CTRL_6 0xc780 +#define DPLL_CTRL_7 0xc7bc +#define SYS_DPLL_CTRL 0xc800 + +#define DPLL_PHASE_0 0xc818 +/* Signed 42-bit FFO in units of 2^(-53) */ +#define DPLL_WR_PHASE 0x0000 +#define DPLL_PHASE_1 0xc81c +#define DPLL_PHASE_2 0xc820 +#define DPLL_PHASE_3 0xc824 +#define DPLL_PHASE_4 0xc828 +#define DPLL_PHASE_5 0xc82c +#define DPLL_PHASE_6 0xc830 +#define DPLL_PHASE_7 0xc834 + +#define DPLL_FREQ_0 0xc838 +/* Signed 42-bit FFO in units of 2^(-53) */ +#define DPLL_WR_FREQ 0x0000 +#define DPLL_FREQ_1 0xc840 +#define DPLL_FREQ_2 0xc848 +#define DPLL_FREQ_3 0xc850 +#define DPLL_FREQ_4 0xc858 +#define DPLL_FREQ_5 0xc860 +#define DPLL_FREQ_6 0xc868 +#define DPLL_FREQ_7 0xc870 + +#define DPLL_PHASE_PULL_IN_0 0xc880 +#define PULL_IN_OFFSET 0x0000 /* Signed 32 bit */ +#define PULL_IN_SLOPE_LIMIT 0x0004 /* Unsigned 24 bit */ +#define PULL_IN_CTRL 0x0007 +#define DPLL_PHASE_PULL_IN_1 0xc888 +#define DPLL_PHASE_PULL_IN_2 0xc890 +#define DPLL_PHASE_PULL_IN_3 0xc898 +#define DPLL_PHASE_PULL_IN_4 0xc8a0 +#define DPLL_PHASE_PULL_IN_5 0xc8a8 +#define DPLL_PHASE_PULL_IN_6 0xc8b0 +#define DPLL_PHASE_PULL_IN_7 0xc8b8 + +#define GPIO_CFG 0xc8c0 +#define GPIO_CFG_GBL 0x0000 +#define GPIO_0 0xc8c2 +#define GPIO_DCO_INC_DEC 0x0000 +#define GPIO_OUT_CTRL_0 0x0001 +#define GPIO_OUT_CTRL_1 0x0002 +#define GPIO_TOD_TRIG 0x0003 +#define GPIO_DPLL_INDICATOR 0x0004 +#define GPIO_LOS_INDICATOR 0x0005 +#define GPIO_REF_INPUT_DSQ_0 0x0006 +#define GPIO_REF_INPUT_DSQ_1 0x0007 +#define GPIO_REF_INPUT_DSQ_2 0x0008 +#define GPIO_REF_INPUT_DSQ_3 0x0009 +#define GPIO_MAN_CLK_SEL_0 0x000a +#define GPIO_MAN_CLK_SEL_1 0x000b +#define GPIO_MAN_CLK_SEL_2 0x000c +#define GPIO_SLAVE 0x000d +#define GPIO_ALERT_OUT_CFG 0x000e +#define GPIO_TOD_NOTIFICATION_CFG 0x000f +#define GPIO_CTRL 0x0010 +#define GPIO_CTRL_V520 0x0011 +#define GPIO_1 0xc8d4 +#define GPIO_2 0xc8e6 +#define GPIO_3 0xc900 +#define GPIO_4 0xc912 +#define GPIO_5 0xc924 +#define GPIO_6 0xc936 +#define GPIO_7 0xc948 +#define GPIO_8 0xc95a +#define GPIO_9 0xc980 +#define GPIO_10 0xc992 +#define GPIO_11 0xc9a4 +#define GPIO_12 0xc9b6 +#define GPIO_13 0xc9c8 +#define GPIO_14 0xc9da +#define GPIO_15 0xca00 + +#define OUT_DIV_MUX 0xca12 +#define OUTPUT_0 0xca14 +#define OUTPUT_0_V520 0xca20 +/* FOD frequency output divider value */ +#define OUT_DIV 0x0000 +#define OUT_DUTY_CYCLE_HIGH 0x0004 +#define OUT_CTRL_0 0x0008 +#define OUT_CTRL_1 0x0009 +/* Phase adjustment in FOD cycles */ +#define OUT_PHASE_ADJ 0x000c +#define OUTPUT_1 0xca24 +#define OUTPUT_1_V520 0xca30 +#define OUTPUT_2 0xca34 +#define OUTPUT_2_V520 0xca40 +#define OUTPUT_3 0xca44 +#define OUTPUT_3_V520 0xca50 +#define OUTPUT_4 0xca54 +#define OUTPUT_4_V520 0xca60 +#define OUTPUT_5 0xca64 +#define OUTPUT_5_V520 0xca80 +#define OUTPUT_6 0xca80 +#define OUTPUT_6_V520 0xca90 +#define OUTPUT_7 0xca90 +#define OUTPUT_7_V520 0xcaa0 +#define OUTPUT_8 0xcaa0 +#define OUTPUT_8_V520 0xcab0 +#define OUTPUT_9 0xcab0 +#define OUTPUT_9_V520 0xcac0 +#define OUTPUT_10 0xcac0 +#define OUTPUT_10_V520 0xcad0 +#define OUTPUT_11 0xcad0 +#define OUTPUT_11_V520 0xcae0 + +#define SERIAL 0xcae0 +#define SERIAL_V520 0xcaf0 + +#define PWM_ENCODER_0 0xcb00 +#define PWM_ENCODER_1 0xcb08 +#define PWM_ENCODER_2 0xcb10 +#define PWM_ENCODER_3 0xcb18 +#define PWM_ENCODER_4 0xcb20 +#define PWM_ENCODER_5 0xcb28 +#define PWM_ENCODER_6 0xcb30 +#define PWM_ENCODER_7 0xcb38 +#define PWM_DECODER_0 0xcb40 +#define 
PWM_DECODER_1 0xcb48 +#define PWM_DECODER_1_V520 0xcb4a +#define PWM_DECODER_2 0xcb50 +#define PWM_DECODER_2_V520 0xcb54 +#define PWM_DECODER_3 0xcb58 +#define PWM_DECODER_3_V520 0xcb5e +#define PWM_DECODER_4 0xcb60 +#define PWM_DECODER_4_V520 0xcb68 +#define PWM_DECODER_5 0xcb68 +#define PWM_DECODER_5_V520 0xcb80 +#define PWM_DECODER_6 0xcb70 +#define PWM_DECODER_6_V520 0xcb8a +#define PWM_DECODER_7 0xcb80 +#define PWM_DECODER_7_V520 0xcb94 +#define PWM_DECODER_8 0xcb88 +#define PWM_DECODER_8_V520 0xcb9e +#define PWM_DECODER_9 0xcb90 +#define PWM_DECODER_9_V520 0xcba8 +#define PWM_DECODER_10 0xcb98 +#define PWM_DECODER_10_V520 0xcbb2 +#define PWM_DECODER_11 0xcba0 +#define PWM_DECODER_11_V520 0xcbbc +#define PWM_DECODER_12 0xcba8 +#define PWM_DECODER_12_V520 0xcbc6 +#define PWM_DECODER_13 0xcbb0 +#define PWM_DECODER_13_V520 0xcbd0 +#define PWM_DECODER_14 0xcbb8 +#define PWM_DECODER_14_V520 0xcbda +#define PWM_DECODER_15 0xcbc0 +#define PWM_DECODER_15_V520 0xcbe4 +#define PWM_USER_DATA 0xcbc8 +#define PWM_USER_DATA_V520 0xcbf0 + +#define TOD_0 0xcbcc +#define TOD_0_V520 0xcc00 +/* Enable TOD counter, output channel sync and even-PPS mode */ +#define TOD_CFG 0x0000 +#define TOD_CFG_V520 0x0001 +#define TOD_1 0xcbce +#define TOD_1_V520 0xcc02 +#define TOD_2 0xcbd0 +#define TOD_2_V520 0xcc04 +#define TOD_3 0xcbd2 +#define TOD_3_V520 0xcc06 + +#define TOD_WRITE_0 0xcc00 +#define TOD_WRITE_0_V520 0xcc10 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_WRITE 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_WRITE_COUNTER 0x000c +/* TOD write trigger configuration */ +#define TOD_WRITE_SELECT_CFG_0 0x000d +/* TOD write trigger selection */ +#define TOD_WRITE_CMD 0x000f +#define TOD_WRITE_1 0xcc10 +#define TOD_WRITE_1_V520 0xcc20 +#define TOD_WRITE_2 0xcc20 +#define TOD_WRITE_2_V520 0xcc30 +#define TOD_WRITE_3 0xcc30 +#define TOD_WRITE_3_V520 0xcc40 + +#define TOD_READ_PRIMARY_0 0xcc40 +#define TOD_READ_PRIMARY_0_V520 0xcc50 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_READ_PRIMARY 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_READ_PRIMARY_COUNTER 0x000b +/* Read trigger configuration */ +#define TOD_READ_PRIMARY_SEL_CFG_0 0x000c +/* Read trigger selection */ +#define TOD_READ_PRIMARY_CMD 0x000e +#define TOD_READ_PRIMARY_CMD_V520 0x000f +#define TOD_READ_PRIMARY_1 0xcc50 +#define TOD_READ_PRIMARY_1_V520 0xcc60 +#define TOD_READ_PRIMARY_2 0xcc60 +#define TOD_READ_PRIMARY_2_V520 0xcc80 +#define TOD_READ_PRIMARY_3 0xcc80 +#define TOD_READ_PRIMARY_3_V520 0xcc90 + +#define TOD_READ_SECONDARY_0 0xcc90 +#define TOD_READ_SECONDARY_0_V520 0xcca0 +#define TOD_READ_SECONDARY_1 0xcca0 +#define TOD_READ_SECONDARY_1_V520 0xccb0 +#define TOD_READ_SECONDARY_2 0xccb0 +#define TOD_READ_SECONDARY_2_V520 0xccc0 +#define TOD_READ_SECONDARY_3 0xccc0 +#define TOD_READ_SECONDARY_3_V520 0xccd0 + +#define OUTPUT_TDC_CFG 0xccd0 +#define OUTPUT_TDC_CFG_V520 0xcce0 +#define OUTPUT_TDC_0 0xcd00 +#define OUTPUT_TDC_1 0xcd08 +#define OUTPUT_TDC_2 0xcd10 +#define OUTPUT_TDC_3 0xcd18 +#define INPUT_TDC 0xcd20 + +#define SCRATCH 0xcf50 +#define SCRATCH_V520 0xcf4c + +#define EEPROM 0xcf68 +#define EEPROM_V520 0xcf64 + +#define OTP 0xcf70 + +#define BYTE 0xcf80 + +/* Bit definitions for the MAJ_REL register */ +#define MAJOR_SHIFT (1) +#define MAJOR_MASK (0x7f) +#define PR_BUILD BIT(0) + +/* Bit definitions for the USER_GPIO0_TO_7_STATUS register */ +#define GPIO0_LEVEL BIT(0) +#define GPIO1_LEVEL BIT(1) +#define GPIO2_LEVEL BIT(2) +#define GPIO3_LEVEL 
BIT(3) +#define GPIO4_LEVEL BIT(4) +#define GPIO5_LEVEL BIT(5) +#define GPIO6_LEVEL BIT(6) +#define GPIO7_LEVEL BIT(7) + +/* Bit definitions for the USER_GPIO8_TO_15_STATUS register */ +#define GPIO8_LEVEL BIT(0) +#define GPIO9_LEVEL BIT(1) +#define GPIO10_LEVEL BIT(2) +#define GPIO11_LEVEL BIT(3) +#define GPIO12_LEVEL BIT(4) +#define GPIO13_LEVEL BIT(5) +#define GPIO14_LEVEL BIT(6) +#define GPIO15_LEVEL BIT(7) + +/* Bit definitions for the GPIO0_TO_7_OUT register */ +#define GPIO0_DRIVE_LEVEL BIT(0) +#define GPIO1_DRIVE_LEVEL BIT(1) +#define GPIO2_DRIVE_LEVEL BIT(2) +#define GPIO3_DRIVE_LEVEL BIT(3) +#define GPIO4_DRIVE_LEVEL BIT(4) +#define GPIO5_DRIVE_LEVEL BIT(5) +#define GPIO6_DRIVE_LEVEL BIT(6) +#define GPIO7_DRIVE_LEVEL BIT(7) + +/* Bit definitions for the GPIO8_TO_15_OUT register */ +#define GPIO8_DRIVE_LEVEL BIT(0) +#define GPIO9_DRIVE_LEVEL BIT(1) +#define GPIO10_DRIVE_LEVEL BIT(2) +#define GPIO11_DRIVE_LEVEL BIT(3) +#define GPIO12_DRIVE_LEVEL BIT(4) +#define GPIO13_DRIVE_LEVEL BIT(5) +#define GPIO14_DRIVE_LEVEL BIT(6) +#define GPIO15_DRIVE_LEVEL BIT(7) + +/* Bit definitions for the DPLL_TOD_SYNC_CFG register */ +#define TOD_SYNC_SOURCE_SHIFT (1) +#define TOD_SYNC_SOURCE_MASK (0x3) +#define TOD_SYNC_EN BIT(0) + +/* Bit definitions for the DPLL_MODE register */ +#define WRITE_TIMER_MODE BIT(6) +#define PLL_MODE_SHIFT (3) +#define PLL_MODE_MASK (0x7) +#define STATE_MODE_SHIFT (0) +#define STATE_MODE_MASK (0x7) + +/* Bit definitions for the GPIO_CFG_GBL register */ +#define SUPPLY_MODE_SHIFT (0) +#define SUPPLY_MODE_MASK (0x3) + +/* Bit definitions for the GPIO_DCO_INC_DEC register */ +#define INCDEC_DPLL_INDEX_SHIFT (0) +#define INCDEC_DPLL_INDEX_MASK (0x7) + +/* Bit definitions for the GPIO_OUT_CTRL_0 register */ +#define CTRL_OUT_0 BIT(0) +#define CTRL_OUT_1 BIT(1) +#define CTRL_OUT_2 BIT(2) +#define CTRL_OUT_3 BIT(3) +#define CTRL_OUT_4 BIT(4) +#define CTRL_OUT_5 BIT(5) +#define CTRL_OUT_6 BIT(6) +#define CTRL_OUT_7 BIT(7) + +/* Bit definitions for the GPIO_OUT_CTRL_1 register */ +#define CTRL_OUT_8 BIT(0) +#define CTRL_OUT_9 BIT(1) +#define CTRL_OUT_10 BIT(2) +#define CTRL_OUT_11 BIT(3) +#define CTRL_OUT_12 BIT(4) +#define CTRL_OUT_13 BIT(5) +#define CTRL_OUT_14 BIT(6) +#define CTRL_OUT_15 BIT(7) + +/* Bit definitions for the GPIO_TOD_TRIG register */ +#define TOD_TRIG_0 BIT(0) +#define TOD_TRIG_1 BIT(1) +#define TOD_TRIG_2 BIT(2) +#define TOD_TRIG_3 BIT(3) + +/* Bit definitions for the GPIO_DPLL_INDICATOR register */ +#define IND_DPLL_INDEX_SHIFT (0) +#define IND_DPLL_INDEX_MASK (0x7) + +/* Bit definitions for the GPIO_LOS_INDICATOR register */ +#define REFMON_INDEX_SHIFT (0) +#define REFMON_INDEX_MASK (0xf) +/* Active level of LOS indicator, 0=low 1=high */ +#define ACTIVE_LEVEL BIT(4) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_0 register */ +#define DSQ_INP_0 BIT(0) +#define DSQ_INP_1 BIT(1) +#define DSQ_INP_2 BIT(2) +#define DSQ_INP_3 BIT(3) +#define DSQ_INP_4 BIT(4) +#define DSQ_INP_5 BIT(5) +#define DSQ_INP_6 BIT(6) +#define DSQ_INP_7 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_1 register */ +#define DSQ_INP_8 BIT(0) +#define DSQ_INP_9 BIT(1) +#define DSQ_INP_10 BIT(2) +#define DSQ_INP_11 BIT(3) +#define DSQ_INP_12 BIT(4) +#define DSQ_INP_13 BIT(5) +#define DSQ_INP_14 BIT(6) +#define DSQ_INP_15 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_2 register */ +#define DSQ_DPLL_0 BIT(0) +#define DSQ_DPLL_1 BIT(1) +#define DSQ_DPLL_2 BIT(2) +#define DSQ_DPLL_3 BIT(3) +#define DSQ_DPLL_4 BIT(4) +#define DSQ_DPLL_5 BIT(5) +#define DSQ_DPLL_6 BIT(6) 
+#define DSQ_DPLL_7 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_3 register */ +#define DSQ_DPLL_SYS BIT(0) +#define GPIO_DSQ_LEVEL BIT(1) + +/* Bit definitions for the GPIO_TOD_NOTIFICATION_CFG register */ +#define DPLL_TOD_SHIFT (0) +#define DPLL_TOD_MASK (0x3) +#define TOD_READ_SECONDARY BIT(2) +#define GPIO_ASSERT_LEVEL BIT(3) + +/* Bit definitions for the GPIO_CTRL register */ +#define GPIO_FUNCTION_EN BIT(0) +#define GPIO_CMOS_OD_MODE BIT(1) +#define GPIO_CONTROL_DIR BIT(2) +#define GPIO_PU_PD_MODE BIT(3) +#define GPIO_FUNCTION_SHIFT (4) +#define GPIO_FUNCTION_MASK (0xf) + +/* Bit definitions for the OUT_CTRL_1 register */ +#define OUT_SYNC_DISABLE BIT(7) +#define SQUELCH_VALUE BIT(6) +#define SQUELCH_DISABLE BIT(5) +#define PAD_VDDO_SHIFT (2) +#define PAD_VDDO_MASK (0x7) +#define PAD_CMOSDRV_SHIFT (0) +#define PAD_CMOSDRV_MASK (0x3) + +/* Bit definitions for the TOD_CFG register */ +#define TOD_EVEN_PPS_MODE BIT(2) +#define TOD_OUT_SYNC_ENABLE BIT(1) +#define TOD_ENABLE BIT(0) + +/* Bit definitions for the TOD_WRITE_SELECT_CFG_0 register */ +#define WR_PWM_DECODER_INDEX_SHIFT (4) +#define WR_PWM_DECODER_INDEX_MASK (0xf) +#define WR_REF_INDEX_SHIFT (0) +#define WR_REF_INDEX_MASK (0xf) + +/* Bit definitions for the TOD_WRITE_CMD register */ +#define TOD_WRITE_SELECTION_SHIFT (0) +#define TOD_WRITE_SELECTION_MASK (0xf) +/* 4.8.7 */ +#define TOD_WRITE_TYPE_SHIFT (4) +#define TOD_WRITE_TYPE_MASK (0x3) + +/* Bit definitions for the TOD_READ_PRIMARY_SEL_CFG_0 register */ +#define RD_PWM_DECODER_INDEX_SHIFT (4) +#define RD_PWM_DECODER_INDEX_MASK (0xf) +#define RD_REF_INDEX_SHIFT (0) +#define RD_REF_INDEX_MASK (0xf) + +/* Bit definitions for the TOD_READ_PRIMARY_CMD register */ +#define TOD_READ_TRIGGER_MODE BIT(4) +#define TOD_READ_TRIGGER_SHIFT (0) +#define TOD_READ_TRIGGER_MASK (0xf) + +/* Bit definitions for the DPLL_CTRL_COMBO_MASTER_CFG register */ +#define COMBO_MASTER_HOLD BIT(0) + +/* Bit definitions for DPLL_SYS_STATUS register */ +#define DPLL_SYS_STATE_MASK (0xf) + +/* Bit definitions for SYS_APLL_STATUS register */ +#define SYS_APLL_LOSS_LOCK_LIVE_MASK BIT(0) +#define SYS_APLL_LOSS_LOCK_LIVE_LOCKED 0 +#define SYS_APLL_LOSS_LOCK_LIVE_UNLOCKED 1 + +/* Bit definitions for the DPLL0_STATUS register */ +#define DPLL_STATE_MASK (0xf) +#define DPLL_STATE_SHIFT (0x0) + +/* Values of DPLL_N.DPLL_MODE.PLL_MODE */ +enum pll_mode { + PLL_MODE_MIN = 0, + PLL_MODE_NORMAL = PLL_MODE_MIN, + PLL_MODE_WRITE_PHASE = 1, + PLL_MODE_WRITE_FREQUENCY = 2, + PLL_MODE_GPIO_INC_DEC = 3, + PLL_MODE_SYNTHESIS = 4, + PLL_MODE_PHASE_MEASUREMENT = 5, + PLL_MODE_DISABLED = 6, + PLL_MODE_MAX = PLL_MODE_DISABLED, +}; + +enum hw_tod_write_trig_sel { + HW_TOD_WR_TRIG_SEL_MIN = 0, + HW_TOD_WR_TRIG_SEL_MSB = HW_TOD_WR_TRIG_SEL_MIN, + HW_TOD_WR_TRIG_SEL_RESERVED = 1, + HW_TOD_WR_TRIG_SEL_TOD_PPS = 2, + HW_TOD_WR_TRIG_SEL_IRIGB_PPS = 3, + HW_TOD_WR_TRIG_SEL_PWM_PPS = 4, + HW_TOD_WR_TRIG_SEL_GPIO = 5, + HW_TOD_WR_TRIG_SEL_FOD_SYNC = 6, + WR_TRIG_SEL_MAX = HW_TOD_WR_TRIG_SEL_FOD_SYNC, +}; + +enum scsr_read_trig_sel { + /* CANCEL CURRENT TOD READ; MODULE BECOMES IDLE - NO TRIGGER OCCURS */ + SCSR_TOD_READ_TRIG_SEL_DISABLE = 0, + /* TRIGGER IMMEDIATELY */ + SCSR_TOD_READ_TRIG_SEL_IMMEDIATE = 1, + /* TRIGGER ON RISING EDGE OF INTERNAL TOD PPS SIGNAL */ + SCSR_TOD_READ_TRIG_SEL_TODPPS = 2, + /* TRIGGER ON RISING EDGE OF SELECTED REFERENCE INPUT */ + SCSR_TOD_READ_TRIG_SEL_REFCLK = 3, + /* TRIGGER ON RISING EDGE OF SELECTED PWM DECODER 1PPS OUTPUT */ + SCSR_TOD_READ_TRIG_SEL_PWMPPS = 4, +
SCSR_TOD_READ_TRIG_SEL_RESERVED = 5, + /* TRIGGER WHEN WRITE FREQUENCY EVENT OCCURS */ + SCSR_TOD_READ_TRIG_SEL_WRITEFREQUENCYEVENT = 6, + /* TRIGGER ON SELECTED GPIO */ + SCSR_TOD_READ_TRIG_SEL_GPIO = 7, + SCSR_TOD_READ_TRIG_SEL_MAX = SCSR_TOD_READ_TRIG_SEL_GPIO, +}; + +/* Values STATUS.DPLL_SYS_STATUS.DPLL_SYS_STATE */ +enum dpll_state { + DPLL_STATE_MIN = 0, + DPLL_STATE_FREERUN = DPLL_STATE_MIN, + DPLL_STATE_LOCKACQ = 1, + DPLL_STATE_LOCKREC = 2, + DPLL_STATE_LOCKED = 3, + DPLL_STATE_HOLDOVER = 4, + DPLL_STATE_OPEN_LOOP = 5, + DPLL_STATE_MAX = DPLL_STATE_OPEN_LOOP, +}; + +/* 4.8.7 only */ +enum scsr_tod_write_trig_sel { + SCSR_TOD_WR_TRIG_SEL_DISABLE = 0, + SCSR_TOD_WR_TRIG_SEL_IMMEDIATE = 1, + SCSR_TOD_WR_TRIG_SEL_REFCLK = 2, + SCSR_TOD_WR_TRIG_SEL_PWMPPS = 3, + SCSR_TOD_WR_TRIG_SEL_TODPPS = 4, + SCSR_TOD_WR_TRIG_SEL_SYNCFOD = 5, + SCSR_TOD_WR_TRIG_SEL_GPIO = 6, + SCSR_TOD_WR_TRIG_SEL_MAX = SCSR_TOD_WR_TRIG_SEL_GPIO, +}; + +/* 4.8.7 only */ +enum scsr_tod_write_type_sel { + SCSR_TOD_WR_TYPE_SEL_ABSOLUTE = 0, + SCSR_TOD_WR_TYPE_SEL_DELTA_PLUS = 1, + SCSR_TOD_WR_TYPE_SEL_DELTA_MINUS = 2, + SCSR_TOD_WR_TYPE_SEL_MAX = SCSR_TOD_WR_TYPE_SEL_DELTA_MINUS, +}; +#endif diff --git a/include/linux/mfd/rsmu.h b/include/linux/mfd/rsmu.h new file mode 100644 index 000000000000..6870de608233 --- /dev/null +++ b/include/linux/mfd/rsmu.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Core interface for Renesas Synchronization Management Unit (SMU) devices. + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ + +#ifndef __LINUX_MFD_RSMU_H +#define __LINUX_MFD_RSMU_H + +/* The supported devices are ClockMatrix, Sabre and SnowLotus */ +enum rsmu_type { + RSMU_CM = 0x34000, + RSMU_SABRE = 0x33810, + RSMU_SL = 0x19850, +}; + +/** + * + * struct rsmu_ddata - device data structure for sub devices. + * + * @dev: i2c/spi device. + * @regmap: i2c/spi bus access. + * @lock: mutex used by sub devices to make sure a series of + * bus access requests are not interrupted. + * @type: RSMU device type. + * @page: i2c/spi bus driver internal use only. + */ +struct rsmu_ddata { + struct device *dev; + struct regmap *regmap; + struct mutex lock; + enum rsmu_type type; + u16 page; +}; +#endif /* __LINUX_MFD_RSMU_H */ -- cgit v1.2.3 From 69031f500865ee3eee19566a1b9c40a189817eaa Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:34 +0800 Subject: swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used Always have the pointer to the swiotlb pool used in struct device. This could help simplify the code for other pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/device.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..2a22875238a6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,6 +423,7 @@ struct dev_links_info { * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations + * @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. 
@@ -531,6 +532,9 @@ struct device { #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ +#endif +#ifdef CONFIG_SWIOTLB + struct io_tlb_mem *dma_io_tlb_mem; #endif /* arch specific additions */ struct dev_archdata archdata; -- cgit v1.2.3 From 7fd856aa7f4261ddac62ea59d8383fef22a0690e Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:35 +0800 Subject: swiotlb: Update is_swiotlb_buffer to add a struct device argument Update is_swiotlb_buffer to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 216854a5e513..d1f3d95881cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -2,6 +2,7 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include #include #include #include @@ -101,9 +102,9 @@ struct io_tlb_mem { }; extern struct io_tlb_mem *io_tlb_default_mem; -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && paddr >= mem->start && paddr < mem->end; } @@ -115,7 +116,7 @@ bool is_swiotlb_active(void); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } -- cgit v1.2.3 From 6f2beb268a5d35504a636c4a3b7aaa76ec32d96c Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:36 +0800 Subject: swiotlb: Update is_swiotlb_active to add a struct device argument Update is_swiotlb_active to add a struct device argument. This will be useful later to allow for different pools. 
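As a minimal, hypothetical sketch of the call-site pattern this enables (the wrapper below is illustrative and not part of this patch; only is_swiotlb_buffer() is real), the check now resolves against the pool attached to the device rather than the global default:

static bool example_needs_bounce(struct device *dev, phys_addr_t paddr)
{
	/* was: is_swiotlb_buffer(paddr), which consulted io_tlb_default_mem */
	return is_swiotlb_buffer(dev, paddr);
}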
Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d1f3d95881cd..dd1c30a83058 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); -bool is_swiotlb_active(void); +bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE @@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev) return SIZE_MAX; } -static inline bool is_swiotlb_active(void) +static inline bool is_swiotlb_active(struct device *dev) { return false; } -- cgit v1.2.3 From 903cd0f315fe426c6a64c54ed389de0becb663dc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:20 +0800 Subject: swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing Propagate the swiotlb_force into io_tlb_default_mem->force_bounce and use it to determine whether to bounce the data or not. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v2: Includes Will's fix] --- include/linux/swiotlb.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index dd1c30a83058..da348671b0d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force; * unmap calls. * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator + * @force_bounce: %true if swiotlb bouncing is forced */ struct io_tlb_mem { phys_addr_t start; @@ -94,6 +95,7 @@ struct io_tlb_mem { spinlock_t lock; struct dentry *debugfs; bool late_alloc; + bool force_bounce; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -109,6 +111,13 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) return mem && paddr >= mem->start && paddr < mem->end; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->force_bounce; +} + void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -120,6 +129,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + return false; +} static inline void swiotlb_exit(void) { } -- cgit v1.2.3 From f4111e39a52aa5d5136d890bbd1aa87c1c8fe3bc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:40 +0800 Subject: swiotlb: Add restricted DMA alloc/free support Add the functions, swiotlb_{alloc,free} and is_swiotlb_for_alloc to support the memory allocation from restricted DMA pool. The restricted DMA pool is preferred if available. 
Note that since coherent allocation needs remapping, one must set up another device coherent pool by shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic coherent allocation. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index da348671b0d5..3b9454d1e498 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -85,6 +85,7 @@ extern enum swiotlb_force swiotlb_force; * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator * @force_bounce: %true if swiotlb bouncing is forced + * @for_alloc: %true if the pool is used for memory allocation */ struct io_tlb_mem { phys_addr_t start; @@ -96,6 +97,7 @@ struct io_tlb_mem { struct dentry *debugfs; bool late_alloc; bool force_bounce; + bool for_alloc; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -158,4 +160,28 @@ static inline void swiotlb_adjust_size(unsigned long size) extern void swiotlb_print_info(void); extern void swiotlb_set_max_segment(unsigned int); +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size); +bool swiotlb_free(struct device *dev, struct page *page, size_t size); + +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return dev->dma_io_tlb_mem->for_alloc; +} +#else +static inline struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + return NULL; +} +static inline bool swiotlb_free(struct device *dev, struct page *page, + size_t size) +{ + return false; +} +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return false; +} +#endif /* CONFIG_DMA_RESTRICTED_POOL */ + #endif /* __LINUX_SWIOTLB_H */ -- cgit v1.2.3 From 0b84e4f8b793eb4045fd64f6f514165a7974cd16 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:41 +0800 Subject: swiotlb: Add restricted DMA pool initialization Add the initialization function to create restricted DMA pools from matching reserved-memory nodes. Regardless of swiotlb setting, the restricted DMA pool is preferred if available. The restricted DMA pools provide a basic level of protection against the DMA overwriting buffer contents at unexpected times. However, to protect against general data leakage and system memory corruption, the system needs to provide a way to lock down the memory access, e.g., MPU. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 3b9454d1e498..39284ff2a6cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force; * range check to see if the memory was in fact allocated by this * API. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and - * @end. This is command line adjustable via setup_io_tlb_npages. + * @end. For default swiotlb, this is command line adjustable via + * setup_io_tlb_npages. * @used: The number of used IO TLB block. * @list: The free list describing the number of free entries available * from each index. 
-- cgit v1.2.3 From fe364a7d95c24e07e9b3f2ab917f01d6d8330bba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Jul 2021 14:39:40 +0300 Subject: dmaengine: dw: Program xBAR hardware for Elkhart Lake The Intel Elkhart Lake PSE DMA implementation is integrated with a crossbar IP in order to serve more hardware than there are DMA request lines available. Due to this, program the xBAR hardware to flexibly support the PSE peripherals. Device-to-Device transfers have not been tested and are not supported by the DMA Engine, but the code is left in place to document the hardware features. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210712113940.42753-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b34a094b2258..b11b0c8bc5da 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -52,6 +52,7 @@ struct dw_dma_slave { * @max_burst: Maximum value of burst transaction size supported by hardware * per channel (in units of CTL.SRC_TR_WIDTH/CTL.DST_TR_WIDTH). * @protctl: Protection control signals setting per channel. + * @quirks: Optional platform quirks. */ struct dw_dma_platform_data { unsigned int nr_channels; @@ -71,6 +72,8 @@ struct dw_dma_platform_data { #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) unsigned char protctl; +#define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) + unsigned int quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3 From 7ed012969bbcdbd7aef5778a061681e6cbc4b402 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 14 Jul 2021 17:01:59 +0200 Subject: Compiler Attributes: fix __has_attribute(__no_sanitize_coverage__) for GCC 4 Fix __has_attribute(__no_sanitize_coverage__) for GCC 4 by defining __GCC4_has_attribute___no_sanitize_coverage__. Fixes: 540540d06e9d ("kcov: add __no_sanitize_coverage to fix noinstr for all architectures") Reported-by: Geert Uytterhoeven Signed-off-by: Marco Elver Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 183ddd5fd072..7b1fa5c30169 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -36,6 +36,7 @@ # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___no_sanitize_coverage__ 0 # define __GCC4_has_attribute___fallthrough__ 0 #endif -- cgit v1.2.3 From 5d1ef2ce13a9098b4e0d31c50e4c79763a57b444 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:02 +0200 Subject: ima: Introduce ima_get_current_hash_algo() Buffer measurements, unlike file measurements, are not accessible after the measurement is done, as buffers are not suitable for use with the integrity_iint_cache structure (there is no index; for files it is the inode number). In the subsequent patches, the measurement (digest) will be returned directly by the functions that perform the buffer measurement, ima_measure_critical_data() and process_buffer_measurement(). A caller of those functions also needs to know the algorithm used to calculate the digest.
Instead of adding the algorithm as a new parameter to the functions, this patch provides it separately with the new function ima_get_current_hash_algo(). Since the hash algorithm does not change after the IMA setup phase, there is no risk of races (obtaining a digest calculated with a different algorithm than the one returned). Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian [zohar@linux.ibm.com: annotate ima_hash_algo as __ro_after_init] Signed-off-by: Mimi Zohar --- include/linux/ima.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 61d5723ec303..81e830d01ced 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,9 +11,11 @@ #include #include #include +#include struct linux_binprm; #ifdef CONFIG_IMA +extern enum hash_algo ima_get_current_hash_algo(void); extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns, @@ -64,6 +66,11 @@ static inline const char * const *arch_get_ima_policy(void) #endif #else +static inline enum hash_algo ima_get_current_hash_algo(void) +{ + return HASH_ALGO__LAST; +} + static inline int ima_bprm_check(struct linux_binprm *bprm) { return 0; -- cgit v1.2.3 From ce5bb5a86e5ebcd3c2e40e6dd1382027b5d43caf Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:03 +0200 Subject: ima: Return int in the functions to measure a buffer ima_measure_critical_data() and process_buffer_measurement() currently don't return a result as, unlike appraisal-related functions, the result is not used by callers to deny an operation. Measurement-related functions instead rely on the audit subsystem to notify the system administrator when an error occurs. However, ima_measure_critical_data() and process_buffer_measurement() are a special case, as these are the only functions that can return a buffer measurement (for files, there is ima_file_hash()). In a subsequent patch, they will be modified to return the calculated digest. In preparation to return the result of the digest calculation, this patch modifies the return type from void to int, and returns 0 if the buffer has been successfully measured, a negative value otherwise. Given that the result of the measurement is still not necessary, this patch does not modify the behavior of existing callers by processing the returned value. For those, the return value is ignored. 
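As a hedged sketch (not from this patch: the label, event name, buf/buf_len and error handling are made up, and the in-tree callers continue to ignore the result), a caller could now check the outcome of a buffer measurement:

	int ret;

	ret = ima_measure_critical_data("example_label", "example_event",
					buf, buf_len, false);
	if (ret < 0)
		pr_debug("buffer measurement failed: %d\n", ret);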
Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian Acked-by: Paul Moore (for the SELinux bits) Signed-off-by: Mimi Zohar --- include/linux/ima.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 81e830d01ced..60492263aa64 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -35,10 +35,10 @@ extern void ima_post_path_mknod(struct user_namespace *mnt_userns, extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); -extern void ima_measure_critical_data(const char *event_label, - const char *event_name, - const void *buf, size_t buf_len, - bool hash); +extern int ima_measure_critical_data(const char *event_label, + const char *event_name, + const void *buf, size_t buf_len, + bool hash); #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM extern void ima_appraise_parse_cmdline(void); @@ -144,10 +144,13 @@ static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} -static inline void ima_measure_critical_data(const char *event_label, +static inline int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash) {} + bool hash) +{ + return -ENOENT; +} #endif /* CONFIG_IMA */ -- cgit v1.2.3 From ca3c9bdb101d9b9eb3ed8a85cc0fe55915ba49de Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:04 +0200 Subject: ima: Add digest and digest_len params to the functions to measure a buffer This patch performs the final modification necessary to pass the buffer measurement to callers, so that they provide a functionality similar to ima_file_hash(). It adds the 'digest' and 'digest_len' parameters to ima_measure_critical_data() and process_buffer_measurement(). These functions calculate the digest even if there is no suitable rule in the IMA policy and, in this case, they simply return 1 before generating a new measurement entry. 
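A minimal sketch of the resulting calling convention (illustrative only; the label, event name, buf/buf_len and the 64-byte digest size are assumptions, not from this patch):

	u8 digest[64];	/* assumed large enough for the configured algorithm */
	enum hash_algo algo = ima_get_current_hash_algo();
	int ret;

	ret = ima_measure_critical_data("example_label", "example_event",
					buf, buf_len, false,
					digest, sizeof(digest));
	/*
	 * Per this patch: ret == 0 means measured, ret == 1 means the digest
	 * was calculated but no policy rule matched, ret < 0 means error.
	 * The digest is interpreted with the algorithm reported by
	 * ima_get_current_hash_algo() (here: algo).
	 */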
Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- include/linux/ima.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 60492263aa64..b6ab66a546ae 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -38,7 +38,7 @@ extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); extern int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash); + bool hash, u8 *digest, size_t digest_len); #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM extern void ima_appraise_parse_cmdline(void); @@ -147,7 +147,8 @@ static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) { static inline int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash) + bool hash, u8 *digest, + size_t digest_len) { return -ENOENT; } -- cgit v1.2.3 From 463e862ac63ef27fca423782536f6465abc3f180 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:24 +0100 Subject: swiotlb: Convert io_default_tlb_mem to static allocation Since commit 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used"), 'struct device' may hold a copy of the global 'io_default_tlb_mem' pointer if the device is using swiotlb for DMA. A subsequent call to swiotlb_exit() will therefore leave dangling pointers behind in these device structures, resulting in KASAN splats such as: | BUG: KASAN: use-after-free in __iommu_dma_unmap_swiotlb+0x64/0xb0 | Read of size 8 at addr ffff8881d7830000 by task swapper/0/0 | | CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc3-debug #1 | Hardware name: HP HP Desktop M01-F1xxx/87D6, BIOS F.12 12/17/2020 | Call Trace: | | dump_stack+0x9c/0xcf | print_address_description.constprop.0+0x18/0x130 | kasan_report.cold+0x7f/0x111 | __iommu_dma_unmap_swiotlb+0x64/0xb0 | nvme_pci_complete_rq+0x73/0x130 | blk_complete_reqs+0x6f/0x80 | __do_softirq+0xfc/0x3be Convert 'io_default_tlb_mem' to a static structure, so that the per-device pointers remain valid after swiotlb_exit() has been invoked. All users are updated to reference the static structure directly, using the 'nslabs' field to determine whether swiotlb has been initialised. The 'slots' array is still allocated dynamically and referenced via a pointer rather than a flexible array member. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Fixes: 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 39284ff2a6cd..b0cb2a9973f4 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -103,9 +103,9 @@ struct io_tlb_mem { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; - } slots[]; + } *slots; }; -extern struct io_tlb_mem *io_tlb_default_mem; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { -- cgit v1.2.3 From 374c15594c4ee0dfcceb38852bd43be09070f402 Mon Sep 17 00:00:00 2001 From: "Isaac J. 
Manjarres" Date: Wed, 16 Jun 2021 06:38:42 -0700 Subject: iommu/io-pgtable: Introduce unmap_pages() as a page table op The io-pgtable code expects to operate on a single block or granule of memory that is supported by the IOMMU hardware when unmapping memory. This means that when a large buffer that consists of multiple such blocks is unmapped, the io-pgtable code will walk the page tables to the correct level to unmap each block, even for blocks that are virtually contiguous and at the same level, which can incur an overhead in performance. Introduce the unmap_pages() page table op to express to the io-pgtable code that it should unmap a number of blocks of the same size, instead of a single block. Doing so allows multiple blocks to be unmapped in one call to the io-pgtable code, reducing the number of page table walks, and indirect calls. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-2-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..9391c5fa71e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -144,6 +144,7 @@ struct io_pgtable_cfg { * * @map: Map a physically contiguous memory region. * @unmap: Unmap a physically contiguous memory region. + * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * * These functions map directly onto the iommu_ops member functions with @@ -154,6 +155,9 @@ struct io_pgtable_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); + size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); }; -- cgit v1.2.3 From cacffb7f7b45ba7649eedea4c196c6e9f1863bf3 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:43 -0700 Subject: iommu: Add an unmap_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to unmap a virtually contiguous range of pages of the same size. For IOMMU drivers that do not specify an unmap_pages() callback, the existing logic of unmapping memory one page block at a time will be used. Signed-off-by: Isaac J. 
Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-3-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..25a844121be5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -181,6 +181,7 @@ struct iommu_iotlb_gather { * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain + * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -231,6 +232,9 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); + size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, size_t size); -- cgit v1.2.3 From ca073b55d16a83ba7e73cd313312abc68f07f293 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:44 -0700 Subject: iommu/io-pgtable: Introduce map_pages() as a page table op Mapping memory into io-pgtables follows the same semantics that unmapping memory used to follow (i.e. a buffer will be mapped one page block per call to the io-pgtable code). This means that it can be optimized in the same way that unmapping memory was, so add a map_pages() callback to the io-pgtable ops structure, so that a range of pages of the same size can be mapped within the same call. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-4-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 9391c5fa71e6..c43f3b899d2a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -143,6 +143,7 @@ struct io_pgtable_cfg { * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map: Map a physically contiguous memory region. + * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap: Unmap a physically contiguous memory region. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. 
@@ -153,6 +154,9 @@ struct io_pgtable_cfg { struct io_pgtable_ops { int (*map)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, -- cgit v1.2.3 From 910c4406ccc9613de0a54abf910edc4bf8a575c0 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:45 -0700 Subject: iommu: Add a map_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to map a physically contiguous range of pages of the same size. For IOMMU drivers that do not specify a map_pages() callback, the existing logic of mapping memory one page block at a time will be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-5-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 25a844121be5..d7989d4a7404 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -180,6 +180,8 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain + * @map_pages: map a physically contiguous set of pages of the same size to + * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain @@ -230,6 +232,9 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, -- cgit v1.2.3 From 308723e3580027f0cd7c86a5edfe6b5acb6863d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:20 +0800 Subject: iommu: Remove mode argument from iommu_set_dma_strict() We now only ever set strict mode enabled in iommu_set_dma_strict(), so just remove the argument.
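For callers the conversion is mechanical; an illustrative before/after (not part of this patch):

	/* before */ iommu_set_dma_strict(true);
	/* after */  iommu_set_dma_strict();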
Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-7-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d7989d4a7404..4997c78e2670 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -485,7 +485,7 @@ int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); -void iommu_set_dma_strict(bool val); +void iommu_set_dma_strict(void); bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, -- cgit v1.2.3 From c25abcd625505f53b72dc156bac32b5120826742 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:18 +0530 Subject: kdb: Get rid of redundant kdb_register_flags() Commit e4f291b3f7bb ("kdb: Simplify kdb commands registration") allowed registration of pre-allocated kdb commands with a pointer to struct kdbtab_t. Let's switch the other users as well to register pre-allocated kdb commands via: - Changing the prototype for kdb_register() to pass a pointer to struct kdbtab_t instead. - Embedding the kdbtab_t structure in kdb_macro_t rather than individual params. With these changes kdb_register_flags() becomes redundant and hence is removed. Also, since we have switched all users to register pre-allocated commands, the "is_dynamic" flag in struct kdbtab_t becomes redundant and hence is removed as well. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Acked-by: Steven Rostedt (VMware) Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-3-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 0125a677b67f..de858edfb3b8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -13,6 +13,8 @@ * Copyright (C) 2009 Jason Wessel */ +#include + /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as * go, to have different permissions depending upon whether it is called @@ -64,6 +66,17 @@ typedef enum { typedef int (*kdb_func_t)(int, const char **); +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + struct list_head list_node; /* Command list */ +} kdbtab_t; + #ifdef CONFIG_KGDB_KDB #include #include @@ -193,19 +206,13 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) #endif /* ! CONFIG_KALLSYMS */ /* Dynamic kdb shell command registration */ -extern int kdb_register(char *, kdb_func_t, char *, char *, short); -extern int kdb_register_flags(char *, kdb_func_t, char *, char *, - short, kdb_cmdflags_t); -extern int kdb_unregister(char *); +extern int kdb_register(kdbtab_t *cmd); +extern void kdb_unregister(kdbtab_t *cmd); #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...)
{ return 0; } static inline void kdb_init(int level) {} -static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen) { return 0; } -static inline int kdb_register_flags(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen, - kdb_cmdflags_t flags) { return 0; } -static inline int kdb_unregister(char *cmd) { return 0; } +static inline int kdb_register(kdbtab_t *cmd) { return 0; } +static inline void kdb_unregister(kdbtab_t *cmd) {} #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, -- cgit v1.2.3 From e868f0a3c4b9c1d7721f08b703142a876814a3f8 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:20 +0530 Subject: kdb: Rename members of struct kdbtab_t Remove the redundant prefix "cmd_" from the names of the members in struct kdbtab_t for better readability. Suggested-by: Doug Anderson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-5-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index de858edfb3b8..ea0f5e580fac 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -68,12 +68,12 @@ typedef int (*kdb_func_t)(int, const char **); /* The KDB shell command table */ typedef struct _kdbtab { - char *cmd_name; /* Command name */ - kdb_func_t cmd_func; /* Function to execute command */ - char *cmd_usage; /* Usage String for this command */ - char *cmd_help; /* Help message for this command */ - short cmd_minlen; /* Minimum legal # cmd chars required */ - kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + char *name; /* Command name */ + kdb_func_t func; /* Function to execute command */ + char *usage; /* Usage String for this command */ + char *help; /* Help message for this command */ + short minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t flags; /* Command behaviour flags */ struct list_head list_node; /* Command list */ } kdbtab_t; -- cgit v1.2.3 From cf0a95659e659d36838e56cc439d3986dcb46870 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Jul 2021 22:34:50 +0300 Subject: clk: x86: Rename clk-lpt to more specific clk-lpss-atom "LPT" stands for the Lynxpoint PCH. However, the driver is used on a few Intel Atom SoCs. Rename it to reflect this, in line with how another clock driver, i.e. clk-pmc-atom, is named. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210722193450.35321-1-andriy.shevchenko@linux.intel.com Acked-by: Rafael J.
Wysocki Signed-off-by: Stephen Boyd --- include/linux/platform_data/x86/clk-lpss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h index 207e1a317800..41df326583f9 100644 --- a/include/linux/platform_data/x86/clk-lpss.h +++ b/include/linux/platform_data/x86/clk-lpss.h @@ -15,6 +15,6 @@ struct lpss_clk_data { struct clk *clk; }; -extern int lpt_clk_init(void); +extern int lpss_atom_clk_init(void); #endif /* __CLK_LPSS_H */ -- cgit v1.2.3 From 147b589c5f446d602215577835356a96c40a4044 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Tue, 6 Jul 2021 22:21:56 +0800 Subject: remoteproc: fix kernel doc for struct rproc_ops The load_rsc_table hook was removed in commit c1d35c1ab424 ("remoteproc: Rename "load_rsc_table" to "parse_fw"") but got added back again by mistake in commit b1a17513a2d6 ("remoteproc: add vendor resources handling"). The patch also fixes a small code indentation issue which is not worth a separate patch. Fixes: b1a17513a2d6 ("remoteproc: add vendor resources handling") Signed-off-by: Dong Aisheng Link: https://lore.kernel.org/r/20210706142156.952794-2-aisheng.dong@nxp.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index a5b37bc10865..83c09ac36b13 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -369,9 +369,8 @@ enum rsc_handling_status { * @da_to_va: optional platform hook to perform address translations * @parse_fw: parse firmware to extract information (e.g. resource table) * @handle_rsc: optional platform hook to handle vendor resources. Should return - * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a - * negative value on error - * @load_rsc_table: load resource table from firmware image + * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled + * and a negative value on error * @find_loaded_rsc_table: find the loaded resource table from firmware image * @get_loaded_rsc_table: get resource table installed in memory * by external entity -- cgit v1.2.3 From fb1ba406c451045f1063ace70086b4645d4e9d54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:18 +0200 Subject: scsi: scsi_ioctl: Remove scsi_cmd_blk_ioctl() Open code scsi_cmd_blk_ioctl() in its two callers. Link: https://lore.kernel.org/r/20210724072033.1284840-10-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3177181c4326..19aa3d5429c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,8 +889,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); -extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, - unsigned int, void __user *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, -- cgit v1.2.3 From 4f07bfc56157ebc689ef54879e90c48a47294083 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:19 +0200 Subject: scsi: scsi_ioctl: Remove scsi_verify_blk_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Manually verify that the device is not a partition and the caller has admin privileges at the beginning of the sr ioctl method and open code the trivial check for sd as well. Link: https://lore.kernel.org/r/20210724072033.1284840-11-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19aa3d5429c0..e2b972a85012 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -888,7 +888,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); -extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, -- cgit v1.2.3 From 547e2f7093b19a993d76c249b4c3ec8af8127d09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:21 +0200 Subject: scsi: block: Add a queue_max_bytes() helper Return the max_sectors value in bytes. Lifted from scsi_ioctl.c. Link: https://lore.kernel.org/r/20210724072033.1284840-13-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e2b972a85012..9971796819ef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1373,6 +1373,11 @@ static inline unsigned int queue_max_sectors(const struct request_queue *q) return q->limits.max_sectors; } +static inline unsigned int queue_max_bytes(struct request_queue *q) +{ + return min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9) << 9; +} + static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; -- cgit v1.2.3 From 78011042684dfbb50f7060f4623793f7a5c74a01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:23 +0200 Subject: scsi: bsg: Move bsg_scsi_ops to drivers/scsi/ Move the SCSI-specific bsg code into the SCSI midlayer instead of leaving it in the common bsg code.
This just keeps the common bsg code in block/ and also allows building it as a module. Link: https://lore.kernel.org/r/20210724072033.1284840-15-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 2 +- include/linux/bsg.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9971796819ef..d36b67bd7267 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -537,7 +537,7 @@ struct request_queue { int mq_freeze_depth; -#if defined(CONFIG_BLK_DEV_BSG) +#if IS_ENABLED(CONFIG_BLK_DEV_BSG_COMMON) struct bsg_class_device bsg_dev; #endif diff --git a/include/linux/bsg.h b/include/linux/bsg.h index dac37b6e00ec..b887da20bd41 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -5,8 +5,9 @@ #include struct request; +struct request_queue; -#ifdef CONFIG_BLK_DEV_BSG +#ifdef CONFIG_BLK_DEV_BSG_COMMON struct bsg_ops { int (*check_proto)(struct sg_io_v4 *hdr); int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, @@ -24,16 +25,10 @@ struct bsg_class_device { int bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, const struct bsg_ops *ops); -int bsg_scsi_register_queue(struct request_queue *q, struct device *parent); void bsg_unregister_queue(struct request_queue *q); #else -static inline int bsg_scsi_register_queue(struct request_queue *q, - struct device *parent) -{ - return 0; -} static inline void bsg_unregister_queue(struct request_queue *q) { } -#endif /* CONFIG_BLK_DEV_BSG */ +#endif /* CONFIG_BLK_DEV_BSG_COMMON */ #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From 7353dc06c9a8e37c80da7ff986e6ef5123bec8ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:26 +0200 Subject: scsi: scsi_ioctl: Simplify SCSI passthrough permission checking Remove the separate command filter structure and just use a switch statement (which also caught two duplicate commands), return a bool and give the function a sensible name. Link: https://lore.kernel.org/r/20210724072033.1284840-18-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d36b67bd7267..e28679e63373 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1343,7 +1343,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t mode); +bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); static inline bool bdev_is_partition(struct block_device *bdev) { return bdev->bd_partno; -- cgit v1.2.3 From f2542a3be3277a65c766fa6e86b930d3d839f79e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:27 +0200 Subject: scsi: scsi_ioctl: Move the "block layer" SCSI ioctl handling to drivers/scsi Merge the ioctl handling in block/scsi_ioctl.c into its only caller in drivers/scsi/scsi_ioctl.c. Link: https://lore.kernel.org/r/20210724072033.1284840-19-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e28679e63373..8c617a5a5d61 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -28,8 +28,6 @@ #include struct module; -struct scsi_ioctl_command; - struct request_queue; struct elevator_queue; struct blk_trace; @@ -888,13 +886,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); -extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, - unsigned int, void __user *); -extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, - struct scsi_ioctl_command __user *); -extern int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); -extern int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); - extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); @@ -1343,8 +1334,6 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); - static inline bool bdev_is_partition(struct block_device *bdev) { return bdev->bd_partno; -- cgit v1.2.3 From 4e804c39f1be4498d80f379e5b7bc6d4f80f813c Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Wed, 28 Jul 2021 06:12:51 +0200 Subject: gpiolib: convert 'devprop_gpiochip_set_names' to support multiple gpiochip banks per device The default gpiolib-of implementation does not work with the multiple-gpiochip-banks-per-device structure used, for example, by the gpio-mt7621 and gpio-brcmstb drivers. To fix these kinds of situations, driver code has been forced to fill in the names itself to keep the gpiolib code from assigning duplicate names across the banks. Instead of continuing with that antipattern, fix the gpiolib core function so it behaves as expected in every situation, by adding an 'offset' field to the gpiochip structure. This way, we can assume the offset will be zero for normal driver code, where only one gpiochip bank per device is used, but it can be set explicitly in those drivers that really need more than one gpiochip. Reviewed-by: Andy Shevchenko Reviewed-by: Gregory Fong Signed-off-by: Sergio Paracuellos Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 3a268781fcec..a0f9901dcae6 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -312,6 +312,9 @@ struct gpio_irq_chip { * get rid of the static GPIO number space in the long run. * @ngpio: the number of GPIOs handled by this controller; the last GPIO * handled is (base + ngpio - 1). + * @offset: when multiple gpio chips belong to the same device this + * can be used as offset within the device so friendly names can + * be properly assigned. * @names: if set, must be an array of strings to use as alternative * names for the GPIOs in this chip.
Any entry in the array * may be NULL if there is no alias for the GPIO, however the @@ -398,6 +401,7 @@ struct gpio_chip { int base; u16 ngpio; + u16 offset; const char *const *names; bool can_sleep; -- cgit v1.2.3 From ba51bdafaafc065019c6f6a2cdae006d176cee48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Jul 2021 15:02:43 +0200 Subject: scsi: sr: cdrom: Move cdrom_read_cdda_bpc() into the sr driver cdrom_read_cdda_bpc() relies on sending a SCSI command to the low level driver using a REQ_OP_SCSI_IN request. This isn't generic block layer functionality, so move the actual low-level code into the sr driver and call it through a new read_cdda_bpc method in the cdrom_device_ops structure. With this the CDROM code does not have to pull in scsi_normalize_sense() and depend on CONFIG_SCSI_COMMON. Link: https://lore.kernel.org/r/20210730072752.GB23847%40lst.de Tested-by: Anders Roxell Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/cdrom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index f48d0a31deae..c4fef00abdf3 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -86,11 +86,13 @@ struct cdrom_device_ops { /* play stuff */ int (*audio_ioctl) (struct cdrom_device_info *,unsigned int, void *); -/* driver specifications */ - const int capability; /* capability flags */ /* handle uniform packets for scsi type devices (scsi,atapi) */ int (*generic_packet) (struct cdrom_device_info *, struct packet_command *); + int (*read_cdda_bpc)(struct cdrom_device_info *cdi, void __user *ubuf, + u32 lba, u32 nframes, u8 *last_sense); +/* driver specifications */ + const int capability; /* capability flags */ }; int cdrom_multisession(struct cdrom_device_info *cdi, -- cgit v1.2.3 From ead09dd3aed5cc6a6c6288a87a5bfa9bbc8d5ecf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:42 +0200 Subject: scsi: bsg: Simplify device registration Use the per-device cdev_device_interface to store the bsg data in the char device inode, and thus remove the need to embed the bsg_class_device structure in the request_queue. Link: https://lore.kernel.org/r/20210729064845.1044147-2-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 6 ------ include/linux/bsg-lib.h | 1 + include/linux/bsg.h | 21 ++++++--------------- 3 files changed, 7 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8c617a5a5d61..28957ccdd9c2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -33,7 +32,6 @@ struct elevator_queue; struct blk_trace; struct request; struct sg_io_hdr; -struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; @@ -535,10 +533,6 @@ struct request_queue { int mq_freeze_depth; -#if IS_ENABLED(CONFIG_BLK_DEV_BSG_COMMON) - struct bsg_class_device bsg_dev; -#endif - #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index 960988d42f77..6b211323a489 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -12,6 +12,7 @@ #include #include +struct bsg_job; struct request; struct device; struct scatterlist; diff --git a/include/linux/bsg.h b/include/linux/bsg.h index b887da20bd41..fa21f79beda2 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -4,10 +4,11 @@ #include +struct bsg_device; +struct device; struct request; struct request_queue; -#ifdef CONFIG_BLK_DEV_BSG_COMMON struct bsg_ops { int (*check_proto)(struct sg_io_v4 *hdr); int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, @@ -16,19 +17,9 @@ struct bsg_ops { void (*free_rq)(struct request *rq); }; -struct bsg_class_device { - struct device *class_dev; - int minor; - struct request_queue *queue; - const struct bsg_ops *ops; -}; +struct bsg_device *bsg_register_queue(struct request_queue *q, + struct device *parent, const char *name, + const struct bsg_ops *ops); +void bsg_unregister_queue(struct bsg_device *bcd); -int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops); -void bsg_unregister_queue(struct request_queue *q); -#else -static inline void bsg_unregister_queue(struct request_queue *q) -{ -} -#endif /* CONFIG_BLK_DEV_BSG_COMMON */ #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From cf93a27446fe1a6e0acb9bbedf5fce1e98e4fc5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:43 +0200 Subject: scsi: block: Remove BLK_SCSI_MAX_CMDS This was used for the table-based SCSI passthrough permission checking that is gone now. Link: https://lore.kernel.org/r/20210729064845.1044147-3-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28957ccdd9c2..e0bb14acb708 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -271,9 +271,6 @@ enum blk_queue_state { #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ -#define BLK_SCSI_MAX_CMDS (256) -#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) - /* * Zoned block device models (zoned limit).
* -- cgit v1.2.3 From 1e61c1a804d2a2a3c46add01cac3a6e9eca01080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:44 +0200 Subject: scsi: block: Remove the remaining SG_IO-related fields from struct request_queue Move the sg_timeout and sg_reserved_size fields into the bsg_device and scsi_device structures as they have nothing to do with generic block I/O. Note that these values are now separate for bsg vs. SCSI device node access, but that just matches how /dev/sg vs the other nodes has always behaved. Link: https://lore.kernel.org/r/20210729064845.1044147-4-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e0bb14acb708..987f15089eeb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,11 +499,6 @@ struct request_queue { unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ - /* - * sg stuff - */ - unsigned int sg_timeout; - unsigned int sg_reserved_size; int node; struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE -- cgit v1.2.3 From 75ca56409e5b35aa6ceef94462f39ef4f533fc41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:45 +0200 Subject: scsi: bsg: Move the whole request execution into the SCSI/transport handlers Reduce the number of indirect calls by making the handler responsible for the entire execution of the request. Link: https://lore.kernel.org/r/20210729064845.1044147-5-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/bsg.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bsg.h b/include/linux/bsg.h index fa21f79beda2..1ac81c809da9 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -6,20 +6,14 @@ struct bsg_device; struct device; -struct request; struct request_queue; -struct bsg_ops { - int (*check_proto)(struct sg_io_v4 *hdr); - int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode); - int (*complete_rq)(struct request *rq, struct sg_io_v4 *hdr); - void (*free_rq)(struct request *rq); -}; +typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, + fmode_t mode, unsigned int timeout); struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, - const struct bsg_ops *ops); + bsg_sg_io_fn *sg_io_fn); void bsg_unregister_queue(struct bsg_device *bcd); #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From 3136895cc5b665c1ab406d78f90c0700a3551e74 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 23 Jul 2021 02:32:05 -0700 Subject: iommu: Improve iommu_iotlb_gather helpers The Mediatek driver is not the only one which might want a basic address-based gathering behaviour, so although it's arguably simple enough to open-code, let's factor it out for the sake of cleanliness. Let's also take this opportunity to document the intent of these helpers for clarity.
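To make the intent concrete, here is a hedged sketch (my_pgtable_unmap() is hypothetical, not part of the patch) of the address-only pattern that iommu_iotlb_gather_add_range() factors out: a driver's unmap path merely widens the pending invalidation range and leaves the flush to the eventual iotlb_sync.

static size_t my_unmap(struct iommu_domain *domain, unsigned long iova,
		       size_t size, struct iommu_iotlb_gather *gather)
{
	size_t unmapped = my_pgtable_unmap(domain, iova, size);

	/* Only the address range matters; no page-size bookkeeping needed. */
	if (unmapped)
		iommu_iotlb_gather_add_range(gather, iova, unmapped);
	return unmapped;
}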
Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Robin Murphy Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-4-namit@vmware.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..e554871db46f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,38 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build arbitrarily-sized invalidation commands + * where only the address range matters, and simply minimising intermediate + * syncs is preferred. + */ +static inline void iommu_iotlb_gather_add_range(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long end = iova + size - 1; + + if (gather->start > iova) + gather->start = iova; + if (gather->end < end) + gather->end = end; +} + +/** + * iommu_iotlb_gather_add_page - Gather for page-based TLB invalidation + * @domain: IOMMU domain to be invalidated + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build invalidation commands based on individual + * pages, or with page size/table level hints which cannot be gathered if they + * differ. + */ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) @@ -515,11 +547,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->pgsize = size; } - if (gather->end < end) - gather->end = end; - - if (gather->start > start) - gather->start = start; + iommu_iotlb_gather_add_range(gather, iova, size); } /* PCI device grouping function */ -- cgit v1.2.3 From febb82c208e481eee057c70fa3176bb48712a111 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:06 -0700 Subject: iommu: Factor iommu_iotlb_gather_is_disjoint() out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor iommu_iotlb_gather_add_page() and factor out the logic that detects whether the IOTLB gather range and a new range are disjoint. To be used by the next patch that implements different gathering logic for AMD. Note that updating gather->pgsize unconditionally does not affect correctness as the function had (and has) an invariant, in which gather->pgsize always represents the flushing granularity of its range. Arguably, "size" should never be zero, but let's assume for the sake of discussion that it might. If "size" equals "gather->pgsize", then the assignment in question has no impact. Otherwise, if "size" is non-zero, then iommu_iotlb_sync() would initialize the size and range (see iommu_iotlb_gather_init()), and the invariant is kept. Otherwise, "size" is zero, and "gather" already holds a range, so gather->pgsize is non-zero and (gather->pgsize && gather->pgsize != size) is true. Therefore, again, iommu_iotlb_sync() would be called and initialize the size.
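A worked example of the disjoint check, with made-up numbers: suppose the gather currently covers [0x1000, 0x2fff]. Adding iova = 0x3000 with size = 0x1000 is not disjoint, because start (0x3000) is not greater than gather->end + 1 (0x3000); the range is merely adjacent and simply grows to [0x1000, 0x3fff]. Adding iova = 0x5000 instead does satisfy start > gather->end + 1, so the helper reports the ranges as disjoint and the caller syncs before recording the new range.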
Cc: Joerg Roedel Cc: Jiajun Cao Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org> Reviewed-by: Robin Murphy Acked-by: Will Deacon Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-5-namit@vmware.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e554871db46f..979a5ceeea55 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,28 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_is_disjoint - Checks whether a new range is disjoint + * + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to check whether a new range and the gathered range + * are disjoint. For many IOMMUs, flushing the IOMMU in this case is better + * than merging the two, which might lead to unnecessary invalidations. + */ +static inline +bool iommu_iotlb_gather_is_disjoint(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long start = iova, end = start + size - 1; + + return gather->end != 0 && + (end + 1 < gather->start || start > gather->end + 1); +} + + /** * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation * @gather: TLB gather data @@ -533,20 +555,16 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) { - unsigned long start = iova, end = start + size - 1; - /* * If the new page is disjoint from the current range or is mapped at * a different granularity, then sync the TLB so that the gather * structure can be rewritten. */ - if (gather->pgsize != size || - end + 1 < gather->start || start > gather->end + 1) { - if (gather->pgsize) - iommu_iotlb_sync(domain, gather); - gather->pgsize = size; - } + if ((gather->pgsize && gather->pgsize != size) || + iommu_iotlb_gather_is_disjoint(gather, iova, size)) + iommu_iotlb_sync(domain, gather); + gather->pgsize = size; iommu_iotlb_gather_add_range(gather, iova, size); } -- cgit v1.2.3 From 205d76ff0684a0b4fe3ff3a283d143a47439d191 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:50 +0100 Subject: KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap() Now that arm64 has stopped using kvm_is_transparent_hugepage(), we can remove it, as well as PageTransCompoundMap() which was only used by the former. Acked-by: Paolo Bonzini Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210726153552.1535838-5-maz@kernel.org --- include/linux/page-flags.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..1ace27c4a8e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -632,43 +632,6 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } -/* - * PageTransCompoundMap is the same as PageTransCompound, but it also - * guarantees the primary MMU has the entire compound page mapped - * through pmd_trans_huge, which in turn guarantees the secondary MMUs - * can also map the entire compound page. 
This allows the secondary - * MMUs to call get_user_pages() only once for each compound page and - * to immediately map the entire compound page with a single secondary - * MMU fault. If there will be a pmd split later, the secondary MMUs - * will get an update through the MMU notifier invalidation through - * split_huge_pmd(). - * - * Unlike PageTransCompound, this is safe to be called only while - * split_huge_pmd() cannot run from under us, like if protected by the - * MMU notifier, otherwise it may result in page->_mapcount check false - * positives. - * - * We have to treat page cache THP differently since every subpage of it - * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE - * mapped in the current process so comparing subpage's _mapcount to - * compound_mapcount to filter out PTE mapped case. - */ -static inline int PageTransCompoundMap(struct page *page) -{ - struct page *head; - - if (!PageTransCompound(page)) - return 0; - - if (PageAnon(page)) - return atomic_read(&page->_mapcount) < 0; - - head = compound_head(page); - /* File THP is PMD mapped and not PTE mapped */ - return atomic_read(&page->_mapcount) == - atomic_read(compound_mapcount_ptr(head)); -} - /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known -- cgit v1.2.3 From 36c3ce6c0d03a6c9992c3359f879cdc70fde836a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:52 +0100 Subject: KVM: Get rid of kvm_get_pfn() Nobody is using kvm_get_pfn() anymore. Get rid of it. Acked-by: Paolo Bonzini Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210726153552.1535838-7-maz@kernel.org --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..9818d271c2a1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -824,7 +824,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_get_pfn(kvm_pfn_t pfn); void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, -- cgit v1.2.3 From 0b8f11737cffc1a406d1134b58687abc29d76b52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:23 -0700 Subject: KVM: Add infrastructure and macro to mark VM as bugged Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <3a0998645c328bf0895f1290e61821b70f048549.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..5342592841be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 +#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -596,6 +597,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool vm_bugged; #ifdef 
CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -629,6 +631,31 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +static inline void kvm_vm_bugged(struct kvm *kvm) +{ + kvm->vm_bugged = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); +} + +#define KVM_BUG(cond, kvm, fmt...) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + +#define KVM_BUG_ON(cond, kvm) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); @@ -946,7 +973,6 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except); bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, -- cgit v1.2.3 From 7ee3e8c39d3aed6ff4cc618d86ba9128f0c80087 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:24 -0700 Subject: KVM: Export kvm_make_all_cpus_request() for use in marking VMs as bugged Export kvm_make_all_cpus_request() and hoist the request helper declarations of request up to the KVM_REQ_* definitions in preparation for adding a "VM bugged" framework. The framework will add KVM_BUG() and KVM_BUG_ON() as alternatives to full BUG()/BUG_ON() for cases where KVM has definitely hit a bug (in itself or in silicon) and the VM is all but guaranteed to be hosed. Marking a VM bugged will trigger a request to all vCPUs to allow arch code to forcefully evict each vCPU from its run loop. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Message-Id: <1d8cbbc8065d831343e70b5dcaea92268145eef1.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5342592841be..b7bf9d6a7780 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -159,6 +159,15 @@ static inline bool is_error_page(struct page *page) }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) +bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, + unsigned long *vcpu_bitmap, cpumask_var_t tmp); +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); + #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -631,7 +640,6 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) 
\ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; @@ -970,14 +978,6 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except); -bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap); - long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, -- cgit v1.2.3 From 605c713023e3925d0444f495a42c903cb6ce875f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Jun 2021 11:32:07 -0400 Subject: KVM: Introduce kvm_get_kvm_safe() Introduce this safe version of kvm_get_kvm() so that it can be called even during vm destruction. Use it in kvm_debugfs_open() and remove the verbose comment. Prepare to be used elsewhere. Signed-off-by: Peter Xu Message-Id: <20210625153214.43106-3-peterx@redhat.com> [Preserve the comment in kvm_debugfs_open. - Paolo] Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b7bf9d6a7780..de58a0890b1a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -755,6 +755,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); +bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); -- cgit v1.2.3 From 52ac8b358b0cb7e91c966225fca61be5d1c984bc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 May 2021 08:09:15 -0400 Subject: KVM: Block memslot updates across range_start() and range_end() We would like to avoid taking mmu_lock for .invalidate_range_{start,end}() notifications that are unrelated to KVM. Because mmu_notifier_count must be modified while holding mmu_lock for write, and must always be paired across start->end to stay balanced, lock elision must happen in both or none. Therefore, in preparation for this change, this patch prevents memslot updates across range_start() and range_end(). Note, technically flag-only memslot updates could be allowed in parallel, but stalling a memslot update for a relatively short amount of time is not a scalability issue, and this is all more than complex enough. A long note on the locking: a previous version of the patch used an rwsem to block the memslot update while the MMU notifier run, but this resulted in the following deadlock involving the pseudo-lock tagged as "mmu_notifier_invalidate_range_start". 
======================================================
WARNING: possible circular locking dependency detected
5.12.0-rc3+ #6 Tainted: G OE
------------------------------------------------------
qemu-system-x86/3069 is trying to acquire lock:
ffffffff9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190

but task is already holding lock:
ffffaff7410a9160 (&kvm->mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm]

which lock already depends on the new lock.

This corresponds to the following MMU notifier logic:

    invalidate_range_start
      take pseudo lock
      down_read()           (*)
      release pseudo lock
    invalidate_range_end
      take pseudo lock      (**)
      up_read()
      release pseudo lock

At point (*) we take the mmu_notifiers_slots_lock inside the pseudo lock; at point (**) we take the pseudo lock inside the mmu_notifiers_slots_lock. This could cause a deadlock (ignoring for a second that the pseudo lock is not a lock):

- invalidate_range_start waits on down_read(), because the rwsem is held by install_new_memslots
- install_new_memslots waits on down_write(), because the rwsem is held till (another) invalidate_range_end finishes
- invalidate_range_end waits on the pseudo lock, held by invalidate_range_start.

Removing the fairness of the rwsem breaks the cycle (in lockdep terms, it would change the *shared* rwsem readers into *shared recursive* readers), so open-code the wait using a readers count and a spinlock. This also allows handling blockable and non-blockable critical sections in the same way. Losing the rwsem fairness does theoretically allow MMU notifiers to block install_new_memslots forever. Note that mm/mmu_notifier.c's own retry scheme in mmu_interval_read_begin also uses wait/wake_up and is likewise not fair. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index de58a0890b1a..5b6a69caccb5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -548,6 +548,11 @@ struct kvm { struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* Used to wait for completion of MMU notifiers. */ + spinlock_t mn_invalidate_lock; + unsigned long mn_active_invalidate_count; + struct rcuwait mn_memslots_update_rcuwait; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only -- cgit v1.2.3 From d08c8b855140e9f5240b3ffd1b8b9d435675e281 Mon Sep 17 00:00:00 2001 From: Wasim Khan Date: Thu, 29 Jul 2021 14:17:47 +0200 Subject: PCI: Add ACS quirks for NXP LX2xx0 and LX2xx2 platforms Root Ports in NXP LX2xx0 and LX2xx2, where each Root Port is a Root Complex with unique segment numbers, do provide isolation features to disable peer transactions and validate bus numbers in requests, but do not provide an actual PCIe ACS capability. Add ACS quirks for NXP LX2xx0 A/C/E/N and LX2xx2 A/C/E/N platforms.
LX2xx0A : without security features + CAN-FD
  LX2160A (0x8d81) - 16 cores
  LX2120A (0x8da1) - 12 cores
  LX2080A (0x8d83) - 8 cores
LX2xx0C : security features + CAN-FD
  LX2160C (0x8d80) - 16 cores
  LX2120C (0x8da0) - 12 cores
  LX2080C (0x8d82) - 8 cores
LX2xx0E : security features + CAN
  LX2160E (0x8d90) - 16 cores
  LX2120E (0x8db0) - 12 cores
  LX2080E (0x8d92) - 8 cores
LX2xx0N : without security features + CAN
  LX2160N (0x8d91) - 16 cores
  LX2120N (0x8db1) - 12 cores
  LX2080N (0x8d93) - 8 cores
LX2xx2A : without security features + CAN-FD
  LX2162A (0x8d89) - 16 cores
  LX2122A (0x8da9) - 12 cores
  LX2082A (0x8d8b) - 8 cores
LX2xx2C : security features + CAN-FD
  LX2162C (0x8d88) - 16 cores
  LX2122C (0x8da8) - 12 cores
  LX2082C (0x8d8a) - 8 cores
LX2xx2E : security features + CAN
  LX2162E (0x8d98) - 16 cores
  LX2122E (0x8db8) - 12 cores
  LX2082E (0x8d9a) - 8 cores
LX2xx2N : without security features + CAN
  LX2162N (0x8d99) - 16 cores
  LX2122N (0x8db9) - 12 cores
  LX2082N (0x8d9b) - 8 cores
[bhelgaas: put PCI_VENDOR_ID_NXP definition next to PCI_VENDOR_ID_FREESCALE as a clue that they share the same Device ID namespace] Link: https://lore.kernel.org/r/20210729121747.1823086-1-wasim.khan@oss.nxp.com Link: https://lore.kernel.org/r/20210803180021.3252886-1-wasim.khan@oss.nxp.com Signed-off-by: Wasim Khan Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..1a9b8589391c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2451,7 +2451,8 @@ #define PCI_VENDOR_ID_TDI 0x192E #define PCI_DEVICE_ID_TDI_EHCI 0x0101 -#define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_VENDOR_ID_FREESCALE 0x1957 /* duplicate: NXP */ +#define PCI_VENDOR_ID_NXP 0x1957 /* duplicate: FREESCALE */ #define PCI_DEVICE_ID_MPC8308 0xc006 #define PCI_DEVICE_ID_MPC8315E 0x00b4 #define PCI_DEVICE_ID_MPC8315 0x00b5 -- cgit v1.2.3 From a065d5615fc83908ef21ed8159ffb63d816ff5de Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 28 Jul 2021 16:42:27 +0200 Subject: of: unify of_count_phandle_with_args() arguments with !CONFIG_OF Unify the declaration of of_count_phandle_with_args() between enabled and disabled OF by constifying the pointed-to device_node. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 9c2e71e202d1..dfeb065c3fad 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -896,7 +896,7 @@ static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, return -ENOSYS; } -static inline int of_count_phandle_with_args(struct device_node *np, +static inline int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) { -- cgit v1.2.3 From e6ae9a833ef4043b940954b8dcac31493706b9d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 28 Jul 2021 16:42:28 +0200 Subject: gpiolib: constify passed device_node pointer Several gpiolib functions receive a pointer to struct device_node which is later passed to OF functions. These OF functions already accept a pointer to const, so gpiolib can follow a similar approach to indicate they are not modifying the struct device_node.
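A minimal sketch of what this constification buys callers (the function here is hypothetical): code that only inspects a device tree node can take a const pointer and still use the GPIO helpers without casting.

static int my_count_gpios(const struct device_node *np)
{
	/* Compiles cleanly once the helpers accept a const node. */
	return of_gpio_named_count(np, "gpios");
}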
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 8 ++++---- include/linux/of_gpio.h | 15 ++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 566feb56601f..bf945b776555 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -609,7 +609,7 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) struct device_node; -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, +struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label); @@ -619,7 +619,7 @@ struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, struct device_node; static inline -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, +struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label) @@ -633,7 +633,7 @@ struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, struct device_node; struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, + const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label); @@ -644,7 +644,7 @@ struct device_node; static inline struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, + const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label) diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index f821095218b0..8bf2ea859653 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -49,7 +49,7 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) return container_of(gc, struct of_mm_gpio_chip, gc); } -extern int of_get_named_gpio_flags(struct device_node *np, +extern int of_get_named_gpio_flags(const struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags); extern int of_mm_gpiochip_add_data(struct device_node *np, @@ -67,7 +67,7 @@ extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); #include /* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_named_gpio_flags(struct device_node *np, +static inline int of_get_named_gpio_flags(const struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags) { if (flags) @@ -98,7 +98,8 @@ static inline int of_get_named_gpio_flags(struct device_node *np, * The above example defines four GPIOs, two of which are not specified. 
* This function will return '4' */ -static inline int of_gpio_named_count(struct device_node *np, const char* propname) +static inline int of_gpio_named_count(const struct device_node *np, + const char *propname) { return of_count_phandle_with_args(np, propname, "#gpio-cells"); } @@ -109,12 +110,12 @@ static inline int of_gpio_named_count(struct device_node *np, const char* propna * * Same as of_gpio_named_count, but hard coded to use the 'gpios' property */ -static inline int of_gpio_count(struct device_node *np) +static inline int of_gpio_count(const struct device_node *np) { return of_gpio_named_count(np, "gpios"); } -static inline int of_get_gpio_flags(struct device_node *np, int index, +static inline int of_get_gpio_flags(const struct device_node *np, int index, enum of_gpio_flags *flags) { return of_get_named_gpio_flags(np, "gpios", index, flags); @@ -129,7 +130,7 @@ static inline int of_get_gpio_flags(struct device_node *np, int index, * Returns GPIO number to use with Linux generic GPIO API, or one of the errno * value on the error condition. */ -static inline int of_get_named_gpio(struct device_node *np, +static inline int of_get_named_gpio(const struct device_node *np, const char *propname, int index) { return of_get_named_gpio_flags(np, propname, index, NULL); @@ -143,7 +144,7 @@ static inline int of_get_named_gpio(struct device_node *np, * Returns GPIO number to use with Linux generic GPIO API, or one of the errno * value on the error condition. */ -static inline int of_get_gpio(struct device_node *np, int index) +static inline int of_get_gpio(const struct device_node *np, int index) { return of_get_gpio_flags(np, index, NULL); } -- cgit v1.2.3 From edfa378448b566f705f5e81fd1565dc39ef6b716 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 12:17:36 +0200 Subject: clk: Align provider-specific CLK_* bit definitions The definition of CLK_MULTIPLIER_ROUND_CLOSEST is not aligned to the two bit definitions next to it. A deeper inspection reveals that the alignment of CLK_MULTIPLIER_ROUND_CLOSEST does match the most common alignment. Align the bit definitions for the various provider types throughout the file at 40 columns, to increase uniformity. 
Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/5468cd9e50cda8fc59cb6baab9413c6c0de1a974.1626257689.git.geert+renesas@glider.be Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d83b829305c0..7be81d5fcf8c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -342,7 +342,7 @@ struct clk_fixed_rate { unsigned long flags; }; -#define CLK_FIXED_RATE_PARENT_ACCURACY BIT(0) +#define CLK_FIXED_RATE_PARENT_ACCURACY BIT(0) extern const struct clk_ops clk_fixed_rate_ops; struct clk_hw *__clk_hw_register_fixed_rate(struct device *dev, @@ -1020,8 +1020,8 @@ struct clk_fractional_divider { #define to_clk_fd(_hw) container_of(_hw, struct clk_fractional_divider, hw) -#define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) -#define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) +#define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) +#define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) extern const struct clk_ops clk_fractional_divider_ops; struct clk *clk_register_fractional_divider(struct device *dev, @@ -1069,9 +1069,9 @@ struct clk_multiplier { #define to_clk_multiplier(_hw) container_of(_hw, struct clk_multiplier, hw) -#define CLK_MULTIPLIER_ZERO_BYPASS BIT(0) +#define CLK_MULTIPLIER_ZERO_BYPASS BIT(0) #define CLK_MULTIPLIER_ROUND_CLOSEST BIT(1) -#define CLK_MULTIPLIER_BIG_ENDIAN BIT(2) +#define CLK_MULTIPLIER_BIG_ENDIAN BIT(2) extern const struct clk_ops clk_multiplier_ops; -- cgit v1.2.3 From edeb2ca74716056b2be62925f21494d7af65150f Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Fri, 30 Jul 2021 23:59:24 +0200 Subject: clk: qcom: smd: Add support for SM6125 rpm clocks Add rpm smd clocks, PMIC and bus clocks which are required on SM6125 for clients to vote on. Signed-off-by: Martin Botka Link: https://lore.kernel.org/r/20210730215924.733350-2-martin.botka@somainline.org Signed-off-by: Stephen Boyd --- include/linux/soc/qcom/smd-rpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index f2645ec52520..60e66fc9b6bf 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -29,6 +29,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_NCPB 0x6270636E #define QCOM_SMD_RPM_OCMEM_PWR 0x706d636f #define QCOM_SMD_RPM_QPIC_CLK 0x63697071 +#define QCOM_SMD_RPM_QUP_CLK 0x707571 #define QCOM_SMD_RPM_SMPA 0x61706d73 #define QCOM_SMD_RPM_SMPB 0x62706d73 #define QCOM_SMD_RPM_SPDM 0x63707362 -- cgit v1.2.3 From 87689270b10fa9e6fac7242233b355cb6792b845 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:38 +0000 Subject: KVM: Rename lru_slot to last_used_slot lru_slot is used to keep track of the index of the most-recently used memslot. The correct acronym would be "mru" but that is not a common acronym. So call it last_used_slot which is a bit more obvious. 
Suggested-by: Paolo Bonzini Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5b6a69caccb5..bdfd5ed539c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -522,7 +522,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -1200,7 +1200,7 @@ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (unlikely(!slots->used_slots)) @@ -1221,7 +1221,7 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) if (start < slots->used_slots && gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); return &memslots[start]; } -- cgit v1.2.3 From 0f22af940dc8ec4f437189096a5f8677995323b0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:39 +0000 Subject: KVM: Move last_used_slot logic out of search_memslots Make search_memslots unconditionally search all memslots and move the last_used_slot logic up one level to __gfn_to_memslot. This is in preparation for introducing a per-vCPU last_used_slot. As part of this change convert existing callers of search_memslots to __gfn_to_memslot to avoid making any functional changes. Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 64 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bdfd5ed539c9..f30b53a07917 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1189,29 +1189,43 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! 
*/ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1219,19 +1233,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->last_used_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long -- cgit v1.2.3 From fe22ed827c5b60b895b15c5c3f04e04ac606be38 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:40 +0000 Subject: KVM: Cache the last used slot index per vCPU The memslot for a given gfn is looked up multiple times during page fault handling. Avoid binary searching for it multiple times by caching the most recently used slot. There is an existing VM-wide last_used_slot but that does not work well for cases where vCPUs are accessing memory in different slots (see performance data below). Another benefit of caching the most recently used slot (versus looking up the slot once and passing around a pointer) is speeding up memslot lookups *across* faults and during spte prefetching. To measure the performance of this change I ran dirty_log_perf_test with 64 vCPUs and 64 memslots and measured "Populate memory time" and "Iteration 2 dirty memory time". Tests were run with eptad=N to force dirty logging to use fast_page_fault so its performance could be measured.

Config     | Metric                        | Before | After
---------- | ----------------------------- | ------ | ------
tdp_mmu=Y  | Populate memory time          | 6.76s  | 5.47s
tdp_mmu=Y  | Iteration 2 dirty memory time | 2.83s  | 0.31s
tdp_mmu=N  | Populate memory time          | 20.4s  | 18.7s
tdp_mmu=N  | Iteration 2 dirty memory time | 2.65s  | 0.30s

The "Iteration 2 dirty memory time" results are especially compelling because they are equivalent to running the same test with a single memslot. In other words, fast_page_fault performance no longer scales with the number of memslots.
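To make the mechanism concrete, here is a minimal, self-contained user-space sketch of the same pattern: consult a cached index first, fall back to binary search, and refresh the cache on a hit. The slot layout and the ascending sort order are deliberate simplifications, not the kernel code; see the try_get_memslot()/__gfn_to_memslot() hunks above for the real version.

#include <stdio.h>

struct slot { unsigned long base, npages; };

static struct slot slots[] = {
	{ .base = 0x0000, .npages = 0x1000 },
	{ .base = 0x2000, .npages = 0x1000 },
	{ .base = 0x8000, .npages = 0x4000 },
};
static int last_used;	/* analogue of vcpu->last_used_slot */

static int slot_contains(int i, unsigned long gfn)
{
	return gfn >= slots[i].base && gfn < slots[i].base + slots[i].npages;
}

static struct slot *lookup(unsigned long gfn)
{
	int lo = 0, hi = sizeof(slots) / sizeof(slots[0]) - 1;

	if (slot_contains(last_used, gfn))	/* fast path: cached index */
		return &slots[last_used];

	while (lo <= hi) {			/* slow path: binary search */
		int mid = lo + (hi - lo) / 2;

		if (slot_contains(mid, gfn)) {
			last_used = mid;	/* refresh the cache */
			return &slots[mid];
		}
		if (gfn < slots[mid].base)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return NULL;
}

int main(void)
{
	printf("0x9000: %s\n", lookup(0x9000) ? "hit" : "miss");	/* search */
	printf("0x9001: %s\n", lookup(0x9001) ? "hit" : "miss");	/* cached */
	return 0;
}

Caching an index rather than a pointer is what makes staleness harmless: a stale index is simply revalidated and, at worst, falls through to the search.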
Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f30b53a07917..492d183dd7d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -354,6 +354,13 @@ struct kvm_vcpu { struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; + + /* + * The index of the most recently used memslot by this vCPU. It's ok + * if this becomes stale due to memslot changes since we always check + * it is a valid slot. + */ + int last_used_slot; }; /* must be called with irqs disabled */ @@ -1200,6 +1207,12 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) if (slot_index < 0 || slot_index >= slots->used_slots) return NULL; + /* + * slot_index can come from vcpu->last_used_slot which is not kept + * in sync with userspace-controllable memslot deletion. So use nospec + * to prevent the CPU from speculating past the end of memslots[]. + */ + slot_index = array_index_nospec(slot_index, slots->used_slots); slot = &slots->memslots[slot_index]; if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) -- cgit v1.2.3 From 08bf54fcf5ca87328541e035090c6a85c8e064f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Aug 2021 21:43:54 +0300 Subject: dmaengine: dw: Convert members to u32 in platform data u32 is the type used for property retrieval from DT. The type change allows the property-reading routine to be cleaned up. While at it, order the fields in the way they are parsed. Signed-off-by: Andy Shevchenko Reviewed-by: Serge Semin Tested-by: Serge Semin Link: https://lore.kernel.org/r/20210802184355.49879-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b11b0c8bc5da..860ba4bc5ead 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -41,11 +41,11 @@ struct dw_dma_slave { /** * struct dw_dma_platform_data - Controller configuration parameters + * @nr_masters: Number of AHB masters supported by the controller * @nr_channels: Number of channels supported by hardware (max 8) * @chan_allocation_order: Allocate channels starting from 0 or 7 * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0. * @block_size: Maximum block size supported by the controller - * @nr_masters: Number of AHB masters supported by the controller * @data_width: Maximum data width supported by hardware per AHB master * (in bytes, power of 2) * @multi_block: Multi block transfers supported by hardware per channel. @@ -55,25 +55,25 @@ struct dw_dma_slave { * @quirks: Optional platform quirks.
*/ struct dw_dma_platform_data { - unsigned int nr_channels; + u32 nr_masters; + u32 nr_channels; #define CHAN_ALLOCATION_ASCENDING 0 /* zero to seven */ #define CHAN_ALLOCATION_DESCENDING 1 /* seven to zero */ - unsigned char chan_allocation_order; + u32 chan_allocation_order; #define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ #define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ - unsigned char chan_priority; - unsigned int block_size; - unsigned char nr_masters; - unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; - unsigned char multi_block[DW_DMA_MAX_NR_CHANNELS]; + u32 chan_priority; + u32 block_size; + u32 data_width[DW_DMA_MAX_NR_MASTERS]; + u32 multi_block[DW_DMA_MAX_NR_CHANNELS]; u32 max_burst[DW_DMA_MAX_NR_CHANNELS]; #define CHAN_PROTCTL_PRIVILEGED BIT(0) #define CHAN_PROTCTL_BUFFERABLE BIT(1) #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) - unsigned char protctl; + u32 protctl; #define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) - unsigned int quirks; + u32 quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3 From f1653c2e2831e9db6cd68473bbec581782df03a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 Aug 2021 11:05:37 -0700 Subject: xfs: introduce CPU hotplug infrastructure We need to move to per-cpu state for both deferred inode inactivation and CIL tracking, but to do that we need to handle CPUs being removed from the system by the hot-plug code. Introduce generic XFS infrastructure to handle CPU hotplug events that is set up at module init time and torn down at module exit time. Initially, we only need CPU dead notifications, so we only set up a callback for these notifications. The infrastructure can be updated in future for other CPU hotplug state machine notifications easily if ever needed. Signed-off-by: Dave Chinner [djwong: rearrange some macros, fix function prototypes] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f39b34b13871..439adc05be4e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -52,6 +52,7 @@ enum cpuhp_state { CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, + CPUHP_XFS_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC, -- cgit v1.2.3 From 9050ad816f5205c0d069e3e492eb849265ae5167 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 2 Aug 2021 01:33:14 +0200 Subject: mfd: db8500-prcmu: Handle missing FW variant There was an "unknown" firmware variant turning up in the wild causing problems in the clock driver. Add this missing variant and clarify that variants 11 and 15 are Samsung variants, as this is now very well known from released products.
Signed-off-by: Linus Walleij Acked-by: Stephen Boyd Signed-off-by: Lee Jones --- include/linux/mfd/dbx500-prcmu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h index e6ee2ec35de9..cbf9d7619493 100644 --- a/include/linux/mfd/dbx500-prcmu.h +++ b/include/linux/mfd/dbx500-prcmu.h @@ -186,10 +186,11 @@ enum ddr_pwrst { #define PRCMU_FW_PROJECT_U8500_C3 8 #define PRCMU_FW_PROJECT_U8500_C4 9 #define PRCMU_FW_PROJECT_U9500_MBL 10 -#define PRCMU_FW_PROJECT_U8500_MBL 11 /* Customer specific */ +#define PRCMU_FW_PROJECT_U8500_SSG1 11 /* Samsung specific */ #define PRCMU_FW_PROJECT_U8500_MBL2 12 /* Customer specific */ #define PRCMU_FW_PROJECT_U8520 13 #define PRCMU_FW_PROJECT_U8420 14 +#define PRCMU_FW_PROJECT_U8500_SSG2 15 /* Samsung specific */ #define PRCMU_FW_PROJECT_U8420_SYSCLK 17 #define PRCMU_FW_PROJECT_A9420 20 /* [32..63] 9540 and derivatives */ -- cgit v1.2.3 From fffe3cc8c2194f60c4af4fac7f27d25e8828f001 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:19 -0600 Subject: dma-mapping: allow map_sg() ops to return negative error codes Allow dma_map_sgtable() to pass errors from the map_sg() ops. This will be required for returning appropriate error codes when mapping P2PDMA memory. Introduce __dma_map_sg_attrs() which will return the raw error code from the map_sg operation (whether it be negative or zero). Then add a dma_map_sg_attrs() wrapper to convert any negative errors to zero to satisfy the existing calling convention. dma_map_sgtable() defines three error codes that .map_sg implementations are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which is a generic return for cases that are passing DMA_MAPPING_ERROR through. dma_map_sgtable() will convert a zero error return for old map_sg() ops into a -EIO return and return any negative errors as reported. This allows map_sg implementations to start returning multiple negative error codes. Legacy map_sg implementations can continue to return zero until they are all converted. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 5 +++-- include/linux/dma-mapping.h | 35 +++++++---------------------------- 2 files changed, 10 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 0d53a96a3d64..2f842498c448 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -41,8 +41,9 @@ struct dma_map_ops { size_t size, enum dma_data_direction dir, unsigned long attrs); /* - * map_sg returns 0 on error and a value > 0 on success. - * It should never return a value < 0. + * map_sg should return a negative error code on error. See + * dma_map_sgtable() for a list of appropriate error codes + * and their meanings. 
*/ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 183e7103a66d..daa1e360f0ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, @@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev, unsigned long attrs) { } +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } -/** - * dma_map_sgtable - Map the given buffer for DMA - * @dev: The device for which to perform the DMA operation - * @sgt: The sg_table object describing the buffer - * @dir: DMA direction - * @attrs: Optional DMA attributes for the map operation - * - * Maps a buffer described by a scatterlist stored in the given sg_table - * object for the @dir DMA operation by the @dev device. After success the - * ownership for the buffer is transferred to the DMA domain. One has to - * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the - * ownership of the buffer back to the CPU domain before touching the - * buffer by the CPU. - * - * Returns 0 on success or -EINVAL on error during mapping the buffer. - */ -static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, - enum dma_data_direction dir, unsigned long attrs) -{ - int nents; - - nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents <= 0) - return -EINVAL; - sgt->nents = nents; - return 0; -} - /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation -- cgit v1.2.3 From ad8f36e4b6b1c826a0daa5fda2c5839205b5aa8b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:21 -0600 Subject: iommu: return full error code from iommu_map_sg[_atomic]() Convert to ssize_t return code so the return code from __iommu_map() can be returned all the way down through dma_iommu_map_sg(). 
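Taken together with the dma_map_sgtable() change above, the caller-visible convention becomes: zero on success, a real negative errno on failure. A hedged caller-side sketch of checking it (my_map_buffer() and its device are illustrative, not from the patch):

#include <linux/device.h>
#include <linux/dma-mapping.h>

static int my_map_buffer(struct device *dev, struct sg_table *sgt)
{
	int ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);

	if (ret) {
		/*
		 * ret is a real errno now: -EINVAL for invalid input,
		 * -ENOMEM for allocation failure, -EIO for opaque
		 * legacy/DMA_MAPPING_ERROR cases.
		 */
		dev_err(dev, "DMA mapping failed: %d\n", ret);
		return ret;
	}
	return 0;
}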
Signed-off-by: Logan Gunthorpe Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Christoph Hellwig --- include/linux/iommu.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..9369458ba1bd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -414,11 +414,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, extern size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); -extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg,unsigned int nents, int prot); -extern size_t iommu_map_sg_atomic(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot); +extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot); +extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); @@ -679,18 +679,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain *domain, return 0; } -static inline size_t iommu_map_sg(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot) +static inline ssize_t iommu_map_sg(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) { - return 0; + return -ENODEV; } -static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, +static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) { - return 0; + return -ENODEV; } static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) -- cgit v1.2.3 From 71d3d0ebc894294ef9454e45a3ac2e9ba60b3351 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 08:01:27 -0400 Subject: SUNRPC: Convert rpc_client refcount to use refcount_t There are now tools in the refcount library that allow us to convert the client shutdown code. Reported-by: Xiyu Yang Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 8b5d5c97553e..b2edd5fc2f0c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ struct rpc_sysfs_client; * The high-level client handle */ struct rpc_clnt { - atomic_t cl_count; /* Number of references */ + refcount_t cl_count; /* Number of references */ unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ -- cgit v1.2.3 From c2dc3e5fad13aca5d7bdf4bcb52b1a1d707c8555 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:23 -0400 Subject: SUNRPC: Fix potential memory corruption We really should not call rpc_wake_up_queued_task_set_status() with xprt->snd_task as an argument unless we are certain that is actually an rpc_task. 
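The XPRT_SND_IS_COOKIE bit added below enables a guard of roughly this shape; this is a sketch of the pattern, not the actual net/sunrpc hunk:

static void my_wake_pending_tasks(struct rpc_xprt *xprt, int status)
{
	/* snd_task may hold an opaque cookie rather than an rpc_task */
	if (test_bit(XPRT_SND_IS_COOKIE, &xprt->state))
		return;
	rpc_wake_up_queued_task_set_status(&xprt->pending,
					   xprt->snd_task, status);
}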
Fixes: 0445f92c5d53 ("SUNRPC: Fix disconnection races") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..59cd97da895b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -432,6 +432,7 @@ void xprt_release_write(struct rpc_xprt *, struct rpc_task *); #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) +#define XPRT_SND_IS_COOKIE (12) static inline void xprt_set_connected(struct rpc_xprt *xprt) { -- cgit v1.2.3 From de2860f4636256836450c6543be744a50118fc66 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Aug 2021 10:10:00 -0700 Subject: mm: Add kvrealloc() During log recovery of an XFS filesystem with 64kB directory buffers, rebuilding a buffer split across two log records results in a memory allocation warning from krealloc like this: xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff) XFS (dm-0): Unmounting Filesystem XFS (dm-0): Mounting V5 Filesystem XFS (dm-0): Starting recovery (logdev: internal) ------------[ cut here ]------------ WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40 ..... RIP: 0010:get_page_from_freelist+0xdee/0xe40 Call Trace: ? complete+0x3f/0x50 __alloc_pages+0x16f/0x300 alloc_pages+0x87/0x110 kmalloc_order+0x2c/0x90 kmalloc_order_trace+0x1d/0x90 __kmalloc_track_caller+0x215/0x270 ? xlog_recover_add_to_cont_trans+0x63/0x1f0 krealloc+0x54/0xb0 xlog_recover_add_to_cont_trans+0x63/0x1f0 xlog_recovery_process_trans+0xc1/0xd0 xlog_recover_process_ophdr+0x86/0x130 xlog_recover_process_data+0x9f/0x160 xlog_recover_process+0xa2/0x120 xlog_do_recovery_pass+0x40b/0x7d0 ? __irq_work_queue_local+0x4f/0x60 ? irq_work_queue+0x3a/0x50 xlog_do_log_recovery+0x70/0x150 xlog_do_recover+0x38/0x1d0 xlog_recover+0xd8/0x170 xfs_log_mount+0x181/0x300 xfs_mountfs+0x4a1/0x9b0 xfs_fs_fill_super+0x3c0/0x7b0 get_tree_bdev+0x171/0x270 ? suffix_kstrtoint.constprop.0+0xf0/0xf0 xfs_fs_get_tree+0x15/0x20 vfs_get_tree+0x24/0xc0 path_mount+0x2f5/0xaf0 __x64_sys_mount+0x108/0x140 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae Essentially, we are taking a multi-order allocation from kmem_alloc() (which has an open coded no fail, no warn loop) and then reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is then triggering the above warning. This is a regression caused by converting this code from an open coded no fail/no warn reallocation loop to using __GFP_NOFAIL. What we actually need here is kvrealloc(), so that if contiguous page allocation fails we fall back to vmalloc() and we don't get nasty warnings happening in XFS. Fixes: 771915c4f688 ("xfs: remove kmem_realloc()") Signed-off-by: Dave Chinner Acked-by: Mel Gorman Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..e59646a5d44d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -829,6 +829,8 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) return kvmalloc_array(n, size, flags | __GFP_ZERO); } +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -- cgit v1.2.3 From 438623a06bacd69c40c4af633bb09a3bbb9dfc78 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:06 -0400 Subject: SUNRPC: Add svc_rqst::rq_auth_stat I'd like to take commit 4532608d71c8 ("SUNRPC: Clean up generic dispatcher code") even further by using only private local SVC dispatchers for all kernel RPC services. This change would enable the removal of the logic that switches between svc_generic_dispatch() and a service's private dispatcher, and simplify the invocation of the service's pc_release method so that humans can visually verify that it is always invoked properly. All that will come later. First, let's provide a better way to return authentication errors from SVC dispatcher functions. Instead of overloading the dispatch method's *statp argument, add a field to struct svc_rqst that can hold an error value. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 1 + include/linux/sunrpc/svcauth.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..35f12963e1ff 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -282,6 +282,7 @@ struct svc_rqst { void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ + __be32 rq_auth_stat; /* authentication status */ int rq_auth_slack; /* extra space xdr code * should leave in head * for krb5i, krb5p. diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index b0003866a249..6d9cc9080aca 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -127,7 +127,7 @@ struct auth_ops { char * name; struct module *owner; int flavour; - int (*accept)(struct svc_rqst *rq, __be32 *authp); + int (*accept)(struct svc_rqst *rq); int (*release)(struct svc_rqst *rq); void (*domain_release)(struct auth_domain *); int (*set_client)(struct svc_rqst *rq); @@ -149,7 +149,7 @@ struct auth_ops { struct svc_xprt; -extern int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp); +extern int svc_authenticate(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern int svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); -- cgit v1.2.3 From 9082e1d914f8b27114352b1940bbcc7522f682e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:19 -0400 Subject: SUNRPC: Eliminate the RQ_AUTHERR flag Now that there is an alternate method for returning an auth_stat value, replace the RQ_AUTHERR flag with use of that new method. 
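With rq_auth_stat in place, an authentication flavour reports the failure code through the request itself instead of overloading an output parameter; a sketch using a hypothetical flavour (credentials_ok() is an assumed helper, not a SUNRPC API):

static int myauth_accept(struct svc_rqst *rqstp)
{
	if (!credentials_ok(rqstp)) {		/* assumed helper */
		rqstp->rq_auth_stat = rpc_autherr_badcred;
		return SVC_DENIED;
	}
	rqstp->rq_auth_stat = rpc_auth_ok;
	return SVC_OK;
}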
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 35f12963e1ff..63c9210cae06 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -275,7 +275,6 @@ struct svc_rqst { #define RQ_VICTIM (5) /* about to be shut down */ #define RQ_BUSY (6) /* request is busy */ #define RQ_DATA (7) /* request has data */ -#define RQ_AUTHERR (8) /* Request status is auth error */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ @@ -533,7 +532,6 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -__be32 svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err); __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *procinfo); -- cgit v1.2.3 From b390752191a6e09e8fb89625e227db0d5cc0ca33 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Aug 2021 20:39:25 +0300 Subject: gpiolib: Deduplicate forward declaration in the consumer.h header struct acpi_device is repeated in two branches of ifdeffery. Move it out and hence deduplicate. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij --- include/linux/gpio/consumer.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 566feb56601f..414b8f98d70f 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -680,10 +680,10 @@ struct acpi_gpio_mapping { unsigned int quirks; }; -#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI) - struct acpi_device; +#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI) + int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios); void acpi_dev_remove_driver_gpios(struct acpi_device *adev); @@ -696,8 +696,6 @@ struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label); #else /* CONFIG_GPIOLIB && CONFIG_ACPI */ -struct acpi_device; - static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios) { -- cgit v1.2.3 From c1b291e96a6d27ac83938596829086945ff8a36e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Aug 2021 19:00:16 +0300 Subject: gpio: dwapb: Unify ACPI enumeration checks in get_irq() and configure_irqs() Shared IRQ is only enabled for ACPI enumeration, there is no need to have a special flag for that, since we can simply test whether the device has been enumerated by ACPI. This unifies the checks in dwapb_get_irq() and dwapb_configure_irqs().
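The unified check can be as simple as testing for an ACPI companion. A sketch of the idea with illustrative names; only has_acpi_companion() is a real kernel helper here:

#include <linux/acpi.h>
#include <linux/interrupt.h>

static int my_request_gpio_irq(struct device *dev, unsigned int irq,
			       irq_handler_t handler)
{
	/* a shared IRQ is only needed when enumerated via ACPI */
	unsigned long flags = has_acpi_companion(dev) ? IRQF_SHARED : 0;

	return devm_request_irq(dev, irq, handler, flags, dev_name(dev), dev);
}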
Signed-off-by: Andy Shevchenko Acked-by: Lee Jones Acked-by: Serge Semin Tested-by: Serge Semin --- include/linux/platform_data/gpio-dwapb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h index 0aa5c6720259..535e5ed549d9 100644 --- a/include/linux/platform_data/gpio-dwapb.h +++ b/include/linux/platform_data/gpio-dwapb.h @@ -14,7 +14,6 @@ struct dwapb_port_property { unsigned int ngpio; unsigned int gpio_base; int irq[DWAPB_MAX_GPIOS]; - bool irq_shared; }; struct dwapb_platform_data { -- cgit v1.2.3 From 5111c2b6b0194b509f47e6338c4deeeb4497bda8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Aug 2021 19:00:19 +0300 Subject: gpio: dwapb: Get rid of legacy platform data Platform data is a legacy interface to supply device properties to the driver. In this case there are no longer any in-kernel users of it. Just remove it for good. Signed-off-by: Andy Shevchenko Acked-by: Serge Semin Tested-by: Serge Semin --- include/linux/platform_data/gpio-dwapb.h | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 include/linux/platform_data/gpio-dwapb.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h deleted file mode 100644 index 535e5ed549d9..000000000000 --- a/include/linux/platform_data/gpio-dwapb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright(c) 2014 Intel Corporation. - */ - -#ifndef GPIO_DW_APB_H -#define GPIO_DW_APB_H - -#define DWAPB_MAX_GPIOS 32 - -struct dwapb_port_property { - struct fwnode_handle *fwnode; - unsigned int idx; - unsigned int ngpio; - unsigned int gpio_base; - int irq[DWAPB_MAX_GPIOS]; -}; - -struct dwapb_platform_data { - struct dwapb_port_property *properties; - unsigned int nports; -}; - -#endif -- cgit v1.2.3 From ae03c3771b8cbbed3802ad1153d896c32015c520 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 5 Aug 2021 22:18:59 -0300 Subject: vfio: Introduce a vfio_uninit_group_dev() API call This pairs with vfio_init_group_dev() and allows undoing any state that is stored in the vfio_device unrelated to registration. Add appropriately placed calls to all the drivers. The following patch will use this to add pre-registration state for the device set.
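The intended pairing in a driver probe path looks roughly like this (struct my_dev and my_vfio_ops are placeholders):

static int my_probe(struct my_dev *mdev)
{
	int ret;

	vfio_init_group_dev(&mdev->vdev, mdev->dev, &my_vfio_ops);

	ret = vfio_register_group_dev(&mdev->vdev);
	if (ret)
		vfio_uninit_group_dev(&mdev->vdev);	/* undo init-time state */
	return ret;
}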
Signed-off-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a2c5b30e1763..b0875cf8e496 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -61,6 +61,7 @@ extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); +void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); -- cgit v1.2.3 From 2fd585f4ed9de9b9259e95affdd7d8cde06b48c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:00 -0300 Subject: vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. 
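As a sketch of the driver-side usage this enables, a PCI driver that needs open/close serialized across a whole bus could pass the bus pointer as the set_id via the vfio_assign_device_set() helper added below (the wrapper struct and allocator are hypothetical):

static int my_vfio_pci_probe(struct pci_dev *pdev)
{
	struct my_vfio_pci *vdev = my_vfio_pci_alloc(pdev);	/* assumed */
	int ret;

	/* every device that passes pdev->bus shares one open/close lock */
	ret = vfio_assign_device_set(&vdev->core, pdev->bus);
	if (ret)
		return ret;

	return vfio_register_group_dev(&vdev->core);
}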
Signed-off-by: Yishai Hadas Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 ++ include/linux/vfio.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 3a38598c2605..cb5b7ed1d7c3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -111,6 +111,8 @@ struct mdev_parent_ops { int (*create)(struct mdev_device *mdev); int (*remove)(struct mdev_device *mdev); + int (*open_device)(struct mdev_device *mdev); + void (*close_device)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b0875cf8e496..f0e6a72875e4 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,13 +15,28 @@ #include #include +/* + * VFIO devices can be placed in a set, this allows all devices to share this + * structure and the VFIO core will provide a lock that is held around + * open_device()/close_device() for all devices in the set. + */ +struct vfio_device_set { + void *set_id; + struct mutex lock; + struct list_head device_list; + unsigned int device_count; +}; + struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; struct vfio_group *group; + struct vfio_device_set *dev_set; + struct list_head dev_set_list; /* Members below here are private, not for driver use */ refcount_t refcount; + unsigned int open_count; struct completion comp; struct list_head group_next; }; @@ -29,6 +44,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @open_device: Called when the first file descriptor is opened for this device + * @close_device: Opposite of open_device * @open: Called when userspace creates new file descriptor for device * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor @@ -43,6 +60,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*open_device)(struct vfio_device *vdev); + void (*close_device)(struct vfio_device *vdev); int (*open)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -67,6 +86,8 @@ void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); +int vfio_assign_device_set(struct vfio_device *device, void *set_id); + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, -- cgit v1.2.3 From eb24c1007e6852e024dc33b0dd9617b8500a1291 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:10 -0300 Subject: vfio: Remove struct vfio_device_ops open/release Nothing uses this anymore, delete it. 
Signed-off-by: Yishai Hadas Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/14-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mdev.h | 7 ------- include/linux/vfio.h | 4 ---- 2 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index cb5b7ed1d7c3..68427e8fadeb 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,11 +72,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * @mdev: mdev_device device structure which is being * destroyed * Returns integer: success (0) or error (< 0) - * @open: Open mediated device. - * @mdev: mediated device. - * Returns integer: success (0) or error (< 0) - * @release: release mediated device - * @mdev: mediated device. * @read: Read emulation callback * @mdev: mediated device structure * @buf: read buffer @@ -113,8 +108,6 @@ struct mdev_parent_ops { int (*remove)(struct mdev_device *mdev); int (*open_device)(struct mdev_device *mdev); void (*close_device)(struct mdev_device *mdev); - int (*open)(struct mdev_device *mdev); - void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f0e6a72875e4..b53a9557884a 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -46,8 +46,6 @@ struct vfio_device { * * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device - * @open: Called when userspace creates new file descriptor for device - * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor * @write: Perform write(2) on device file descriptor * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* @@ -62,8 +60,6 @@ struct vfio_device { */ struct vfio_device_ops { char *name; int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); - int (*open)(struct vfio_device *vdev); - void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, -- cgit v1.2.3 From c17495b01b72b53bd290f442d39b060e015c7aea Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Aug 2021 12:04:33 +0530 Subject: cpufreq: Add callback to register with energy model Many cpufreq drivers register with the energy model for each policy and do exactly the same thing. Follow in the footsteps of thermal-cooling and get it done from the cpufreq core itself. Provide a new callback, which will be called, if present, by the cpufreq core at the right moment (more on that in the code's comment). Also provide a generic implementation that uses dev_pm_opp_of_register_em(). This also allows us to register with the EM at a later point in time, compared to ->init(), from where the EM core can access the cpufreq policy directly using cpufreq_cpu_get() type of helpers and perform other work, like marking a few frequencies inefficient; this will be done separately.
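A driver that is content with the generic OPP-based registration then reduces to setting the new callback; a sketch, with the other callbacks assumed to exist, using the cpufreq_register_em_with_opp() helper added below:

static struct cpufreq_driver my_cpufreq_driver = {
	.name		= "my-cpufreq",
	.init		= my_cpufreq_init,		/* assumed callbacks */
	.target_index	= my_cpufreq_target_index,
	.register_em	= cpufreq_register_em_with_opp,
};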
Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9fd719475fcd..c65a1d7385f8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -373,6 +375,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1046,4 +1054,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ -- cgit v1.2.3 From 892384cd998a17960dff6ebefc27375f63364111 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 3 Aug 2021 14:16:49 +0200 Subject: iommu/io-pgtable: Add DART pagetable format Apple's DART iommu uses a pagetable format that shares some similarities with the ones already implemented by io-pgtable.c. Add a new format variant to support the required differences so that we don't have to duplicate the pagetable handling code. Reviewed-by: Alexander Graf Reviewed-by: Alyssa Rosenzweig Reviewed-by: Robin Murphy Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210803121651.61594-2-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..a738483fb4da 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -16,6 +16,7 @@ enum io_pgtable_fmt { ARM_V7S, ARM_MALI_LPAE, AMD_IOMMU_V1, + APPLE_DART, IO_PGTABLE_NUM_FMTS, }; @@ -136,6 +137,11 @@ struct io_pgtable_cfg { u64 transtab; u64 memattr; } arm_mali_lpae_cfg; + + struct { + u64 ttbr[4]; + u32 n_ttbrs; + } apple_dart_cfg; }; }; @@ -254,5 +260,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns; extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns; +extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns; #endif /* __IO_PGTABLE_H */ -- cgit v1.2.3 From ca91ea34778f9b2a44a391b10164bcd73b4b0f25 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:54 +1200 Subject: asus-wmi: Add panel overdrive functionality Some ASUS ROG laptops have the ability to drive the display panel at a higher rate to eliminate or reduce ghosting. Signed-off-by: Luke D.
Jones Link: https://lore.kernel.org/r/20210807023656.25020-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 2f274cf52805..428aea701c7b 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -61,6 +61,7 @@ #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075 /* Misc */ +#define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 -- cgit v1.2.3 From 98829e84dc67630efb7de675f0a70066620468a3 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:55 +1200 Subject: asus-wmi: Add dgpu disable method In Windows the ASUS Armory Crate program can enable or disable the dGPU via a WMI call. This functions much the same as various Linux methods in software where the dGPU is removed from the device tree. However the WMI call saves the state of dGPU (enabled or not) and this then changes the dGPU visibility in Linux with no way for Linux users to re-enable it. We expose the WMI method so users can see and change the dGPU ACPI state. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20210807023656.25020-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 428aea701c7b..a528f9d0e4b7 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -90,6 +90,9 @@ /* Keyboard dock */ #define ASUS_WMI_DEVID_KBD_DOCK 0x00120063 +/* dgpu on/off */ +#define ASUS_WMI_DEVID_DGPU 0x00090020 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 382b91db8044669d254006df799df9d85d4ad891 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:56 +1200 Subject: asus-wmi: Add egpu enable method The X13 Flow laptops can utilise an external GPU. This requires toggling an ACPI method which will first disable the internal dGPU, and then enable the eGPU. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20210807023656.25020-4-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index a528f9d0e4b7..17dc5cb6f3f2 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -90,6 +90,9 @@ /* Keyboard dock */ #define ASUS_WMI_DEVID_KBD_DOCK 0x00120063 +/* dgpu on/off */ +#define ASUS_WMI_DEVID_EGPU 0x00090019 + /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 -- cgit v1.2.3 From 20a1b3acfc802ad7b6b327f2bdc0570711538561 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 3 Aug 2021 18:00:41 +0200 Subject: i2c: acpi: Add an i2c_acpi_client_count() helper function We have 3 files now which have the need to count the number of I2cSerialBus resources in an ACPI-device's resource-list. 
Currently all implement their own helper function for this, add a generic helper function to replace the 3 implementations. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210803160044.158802-2-hdegoede@redhat.com Acked-by: Mika Westerberg Acked-by: Wolfram Sang --- include/linux/i2c.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 3eb60a2e9e61..2ce3efbe9198 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1010,6 +1010,7 @@ struct acpi_resource_i2c_serialbus; #if IS_ENABLED(CONFIG_ACPI) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); +int i2c_acpi_client_count(struct acpi_device *adev); u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info); @@ -1020,6 +1021,10 @@ static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, { return false; } +static inline int i2c_acpi_client_count(struct acpi_device *adev) +{ + return 0; +} static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; -- cgit v1.2.3 From fd00faa375fbb9d46ae0730d0faf4a3006301005 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 8 Aug 2021 19:21:56 +0200 Subject: PCI/VPD: Embed struct pci_vpd in struct pci_dev Now that struct pci_vpd is really small, simplify the code by embedding struct pci_vpd directly in struct pci_dev instead of dynamically allocating it. Link: https://lore.kernel.org/r/d898489e-22ba-71f1-2f31-f1a78dc15849@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..e752cc39a1fe 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -300,9 +300,14 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +struct pci_vpd { + struct mutex lock; + unsigned int len; + u8 cap; +}; + struct irq_affinity; struct pcie_link_state; -struct pci_vpd; struct pci_sriov; struct pci_p2pdma; struct rcec_ea; @@ -473,7 +478,7 @@ struct pci_dev { #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; #endif - struct pci_vpd *vpd; + struct pci_vpd vpd; #ifdef CONFIG_PCIE_DPC u16 dpc_cap; unsigned int dpc_rp_extensions:1; -- cgit v1.2.3 From 928f9e2686110825262685b7aedc7b21b805fecd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Aug 2021 20:00:23 +0300 Subject: clk: fractional-divider: Hide clk_fractional_divider_ops from wide audience The providers are all located in drivers/clk/ and hence no need to export the clock operations to wider audience. Hide them by moving to drivers/clk/clk-fractional-divider.h. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210812170025.67074-2-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d83b829305c0..acb8e10d2898 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1023,7 +1023,6 @@ struct clk_fractional_divider { #define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) #define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) -extern const struct clk_ops clk_fractional_divider_ops; struct clk *clk_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, -- cgit v1.2.3 From 82f53f9ee5770177eb102446cc3513bf07e2668a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Aug 2021 20:00:24 +0300 Subject: clk: fractional-divider: Introduce POWER_OF_TWO_PS flag The newly introduced POWER_OF_TWO_PS flag, when set, makes the flow skip the assumption that the caller will use an additional 2^scale prescaler to get the desired clock rate. Reported-by: Liu Ying Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210812170025.67074-3-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index acb8e10d2898..d63d07fd251b 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1001,6 +1001,12 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev, * CLK_FRAC_DIVIDER_BIG_ENDIAN - By default little endian register accesses are * used for the divider register. Setting this flag makes the register * accesses big endian. + * CLK_FRAC_DIVIDER_POWER_OF_TWO_PS - By default the resulting fraction might + * be saturated and the caller will get quite far from the good enough + * approximation. Instead the caller may require, by setting this flag, + * to shift left by a few bits in case, when the asked one is quite small + * to satisfy the desired range of denominator. It assumes that on the + * caller's side the power-of-two capable prescaler exists. */ struct clk_fractional_divider { struct clk_hw hw; @@ -1022,6 +1028,7 @@ struct clk_fractional_divider { #define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) #define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) +#define CLK_FRAC_DIVIDER_POWER_OF_TWO_PS BIT(2) struct clk *clk_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, -- cgit v1.2.3 From 7bb698f09bdd01fbb6d48c14bb1dde556dc1af00 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 28 Jul 2021 07:47:33 -0500 Subject: fs: Move notify_change permission checks into may_setattr Move the permission checks in notify_change into a separate function to make them available to filesystems. When notify_change is called, the vfs performs those checks before calling into iop->setattr. However, a filesystem like gfs2 can only lock and revalidate the inode inside ->setattr, and it must then repeat those checks to err on the safe side. It would be nice to get rid of the double checking, but moving the permission check into iop->setattr altogether isn't really an option.
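With may_setattr() exported, an ->setattr implementation in the gfs2 mould can take its locks first and then run the common checks exactly once. A sketch with placeholder locking and apply helpers:

static int my_setattr(struct user_namespace *mnt_userns,
		      struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	my_inode_lock(inode);			/* fs-specific, placeholder */
	error = may_setattr(mnt_userns, inode, attr->ia_valid);
	if (!error)
		error = my_apply_setattr(inode, attr);	/* placeholder */
	my_inode_unlock(inode);
	return error;
}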
Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..50192964bf6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3469,6 +3469,8 @@ extern int buffer_migrate_page_norefs(struct address_space *, #define buffer_migrate_page_norefs NULL #endif +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid); int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct user_namespace *, struct inode *inode, -- cgit v1.2.3 From 3165af738ed3224a84ead7d97c6909de2e453b4c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:04:49 -0400 Subject: KVM: Allow to have arch-specific per-vm debugfs files Allow archs to create arch-specific nodes under kvm->debugfs_dentry directory besides the stats fields. The new interface kvm_arch_create_vm_debugfs() is defined but not yet used. It's called after kvm->debugfs_dentry is created, so it can be referenced directly in kvm_arch_create_vm_debugfs(). Arch should define their own versions when they want to create extra debugfs nodes. Signed-off-by: Peter Xu Message-Id: <20210730220455.26054-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 492d183dd7d0..f50bfcf225f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1073,6 +1073,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); +int kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* -- cgit v1.2.3 From 2a047e0662aee1bd773e0415accd785ad26a9398 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 17:19:36 +0200 Subject: dma-mapping: return an unsigned int from dma_map_sg{,_attrs} These can only return 0 for failure or the number of entries, so turn the return value into an unsigned int. 
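For callers, nothing changes except that the zero-means-failure convention is now visible in the type. A minimal sketch with illustrative names:

static int my_map(struct device *dev, struct scatterlist *sgl, int nents)
{
	unsigned int mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (!mapped)
		return -EIO;	/* 0 is the only failure value */

	return mapped;		/* otherwise a positive entry count */
}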
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe --- include/linux/dma-mapping.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index daa1e360f0ee..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -105,8 +105,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs); +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); @@ -166,8 +166,9 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +static inline unsigned int dma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } -- cgit v1.2.3 From fb83610762dd5927212aa62a468dd3b756b57a88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 11:06:44 +0200 Subject: thermal/core: Fix thermal_cooling_device_register() prototype There are two pairs of declarations for thermal_cooling_device_register() and thermal_of_cooling_device_register(), and only one set was changed in a recent patch, so the other one now causes a compile-time warning: drivers/net/wireless/mediatek/mt76/mt7915/init.c: In function 'mt7915_thermal_init': drivers/net/wireless/mediatek/mt76/mt7915/init.c:134:48: error: passing argument 1 of 'thermal_cooling_device_register' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 134 | cdev = thermal_cooling_device_register(wiphy_name(wiphy), phy, | ^~~~~~~~~~~~~~~~~ In file included from drivers/net/wireless/mediatek/mt76/mt7915/init.c:7: include/linux/thermal.h:407:39: note: expected 'char *' but argument is of type 'const char *' 407 | thermal_cooling_device_register(char *type, void *devdata, | ~~~~~~^~~~ Change the dummy helper functions to have the same arguments as the normal version. 
Fixes: f991de53a8ab ("thermal: make device_register's type argument const") Signed-off-by: Arnd Bergmann Reviewed-by: Jean-Francois Dagenais Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210722090717.1116748-1-arnd@kernel.org --- include/linux/thermal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d296f3b88fb9..8050d929a5b4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -404,12 +404,13 @@ static inline void thermal_zone_device_unregister( struct thermal_zone_device *tz) { } static inline struct thermal_cooling_device * -thermal_cooling_device_register(char *type, void *devdata, +thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, - char *type, void *devdata, const struct thermal_cooling_device_ops *ops) + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, -- cgit v1.2.3 From 454f2ed4b34f9ef5726d080b1eb5dc47a7f36d6f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 13:27:01 +0200 Subject: thermal: Spelling s/scallbacks/callbacks/ Fix a misspelling of the word "callbacks". Signed-off-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/ae38372996a23bb67769e2d62ca170ae9457c4df.1626261946.git.geert+renesas@glider.be --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 8050d929a5b4..c314893970b3 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -285,7 +285,7 @@ struct thermal_zone_params { }; /** - * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones + * struct thermal_zone_of_device_ops - callbacks for handling DT based zones * * Mandatory: * @get_temp: a pointer to a function that reads the sensor temperature. -- cgit v1.2.3 From c3b1c377f0102e88dea6354d9cdb34a9d1c90971 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 2 Aug 2021 14:02:34 +0000 Subject: tracing: Fix a typo in tracepoint.h It should be @prev_pid, not @prev_prid. 
Link: https://lkml.kernel.org/r/20210802140234.5383-1-shijie@os.amperecomputing.com Signed-off-by: Huang Shijie Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index ab58696d0ddd..28031b15f878 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -475,7 +475,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declaration: + * * __field(pid_t, prev_pid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * -- cgit v1.2.3 From 4f911138c8da94bcff84f1d093d28e378703c43f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 19 Jun 2021 12:26:16 +0300 Subject: fs: add generic helper for filling statx attribute flags The immutable and append-only properties on an inode are published on the inode's i_flags and enforced by the VFS. Create a helper to fill the corresponding STATX_ATTR_ flags in the kstat structure from the inode's i_flags. Only orangefs was converted to use this helper. Other filesystems could use it in the future. Suggested-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 1 + include/linux/stat.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..ae6c6c34db94 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3355,6 +3355,7 @@ extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index fff27e603814..7df06931f25d 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -34,6 +34,10 @@ struct kstat { STATX_ATTR_ENCRYPTED | \ STATX_ATTR_VERITY \ )/* Attrs corresponding to FS_*_FL flags */ +#define KSTAT_ATTR_VFS_FLAGS \ + (STATX_ATTR_IMMUTABLE | \ + STATX_ATTR_APPEND \ + ) /* Attrs corresponding to S_* flags that are enforced by the VFS */ u64 ino; dev_t dev; dev_t rdev; -- cgit v1.2.3 From 69139244806537f9d51364f37fe146bb2ee88a05 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:52 +0530 Subject: PCI: Cache PCIe Device Capabilities register Add a new member called devcap in struct pci_dev for caching the PCIe Device Capabilities register to avoid reading PCI_EXP_DEVCAP multiple times. Refactor pcie_has_flr() to use cached device capabilities.
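A sketch of the kind of check the cached field enables (simplified; example_has_flr() is an illustrative name, not the exact pcie_has_flr() body):

static bool example_has_flr(struct pci_dev *dev)
{
	if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET)
		return false;

	/* No config-space read needed: devcap was cached at enumeration. */
	return dev->devcap & PCI_EXP_DEVCAP_FLR;
}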
Link: https://lore.kernel.org/r/20210817180500.1253-2-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..1179c0ee2bfb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -333,6 +333,7 @@ struct pci_dev { struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ struct pci_dev *rcec; /* Associated RCEC device */ #endif + u32 devcap; /* PCIe Device Capabilities */ u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ u8 msix_cap; /* MSI-X capability offset */ -- cgit v1.2.3 From 56f107d7813f116484019617043393a7753ffcbf Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:53 +0530 Subject: PCI: Add pcie_reset_flr() with 'probe' argument Most reset methods are of the form "pci_*_reset(dev, probe)". pcie_flr() was an exception because it relied on a separate pcie_has_flr() function instead of taking a "probe" argument. Add "pcie_reset_flr(dev, probe)" to follow the convention. Remove pcie_has_flr(). Some pcie_flr() callers that did not use pcie_has_flr() remain. [bhelgaas: commit log, rework pcie_reset_flr() to use dev->devcap directly] Link: https://lore.kernel.org/r/20210817180500.1253-3-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1179c0ee2bfb..1de37e3fc29d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1229,7 +1229,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -bool pcie_has_flr(struct pci_dev *dev); +int pcie_reset_flr(struct pci_dev *dev, int probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -- cgit v1.2.3 From e20afa06244eb5d7fa850f9fe2a78ae17ba96f81 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:54 +0530 Subject: PCI: Add array to track reset method ordering Add reset_methods[] in struct pci_dev to keep track of reset mechanisms supported by the device and their ordering. Refactor probing and reset functions to take advantage of calling convention of reset functions. Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-4-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1de37e3fc29d..2faf517d20c1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -49,6 +49,9 @@ PCI_STATUS_SIG_TARGET_ABORT | \ PCI_STATUS_PARITY) +/* Number of reset methods used in pci_reset_fn_methods array in pci.c */ +#define PCI_NUM_RESET_METHODS 6 + /* * The PCI interface treats multi-function devices as independent * devices. 
The slot/function address of each device is encoded @@ -506,6 +509,9 @@ struct pci_dev { char *driver_override; /* Driver name to force a match */ unsigned long priv_flags; /* Private flags for the PCI driver */ + + /* These methods index pci_reset_fn_methods[] */ + u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */ }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) -- cgit v1.2.3 From 4ec36dfeb155b72da8d28ab006a46f2f8b981eac Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:55 +0530 Subject: PCI: Remove reset_fn field from pci_dev "reset_fn" indicates whether the device supports any reset mechanism. Remove the use of reset_fn in favor of the reset_methods array that tracks supported reset mechanisms of a device and their ordering. The octeon driver incorrectly used reset_fn to detect whether the device supports FLR or not. Use pcie_reset_flr() to probe whether it supports FLR. Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-5-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2faf517d20c1..d1f4d248617b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -431,7 +431,6 @@ struct pci_dev { unsigned int state_saved:1; unsigned int is_physfn:1; unsigned int is_virtfn:1; - unsigned int reset_fn:1; unsigned int is_hotplug_bridge:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ -- cgit v1.2.3 From de3438c47a8ddc75548e62a03736a9321c2b7bac Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Mon, 9 Aug 2021 15:15:59 -0400 Subject: firmware: qcom_scm: Introduce SCM calls to access LMh Introduce SCM calls to access/configure limits management hardware(LMH). 
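A hedged sketch of a consumer of the new calls (the profile id and error handling are illustrative only):

	int ret;

	if (!qcom_scm_lmh_dcvsh_available())
		return -ENODEV;

	/* Ask LMh to switch to a different throttling profile. */
	ret = qcom_scm_lmh_profile_change(1);
	if (ret)
		return ret;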
Reviewed-by: Bjorn Andersson Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210809191605.3742979-2-thara.gopinath@linaro.org --- include/linux/qcom_scm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index 0165824c5128..c0475d1c9885 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -109,6 +109,12 @@ extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp); extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en); + +extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version); +extern int qcom_scm_lmh_profile_change(u32 profile_id); +extern bool qcom_scm_lmh_dcvsh_available(void); + #else #include <linux/errno.h> @@ -170,5 +176,13 @@ static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) { return -ENODEV; } + +static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version) + { return -ENODEV; } + +static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } + +static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } #endif #endif -- cgit v1.2.3 From 46983fcd67ac5a830d41ebe3755314db67a6dd16 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:15 +0100 Subject: iommu: Pull IOVA cookie management into the core Now that everyone has converged on iommu-dma for IOMMU_DOMAIN_DMA support, we can abandon the notion of drivers being responsible for the cookie type, and consolidate all the management into the core code. CC: Yong Wu CC: Chunyan Zhang CC: Maxime Ripard Tested-by: Heiko Stuebner Tested-by: Marek Szyprowski Tested-by: Yoshihiro Shimoda Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/46a2c0e7419c7d1d931762dc7b6a69fa082d199a.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4997c78e2670..141779d76035 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,6 +40,7 @@ struct iommu_domain; struct notifier_block; struct iommu_sva; struct iommu_fault_event; +struct iommu_dma_cookie; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -86,7 +87,7 @@ struct iommu_domain { iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; - void *iova_cookie; + struct iommu_dma_cookie *iova_cookie; }; enum iommu_cap { -- cgit v1.2.3 From 7a7c5badf85806eab75e31ab8d45021f1545b0e3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:28 +0100 Subject: iommu: Indicate queued flushes via gather data Since iommu_iotlb_gather exists to help drivers optimise flushing for a given unmap request, it is also the logical place to indicate whether the unmap is strict or not, and thus help them further optimise for whether to expect a sync or a flush_all subsequently.
As part of that, it also seems fair to make the flush queue code take responsibility for enforcing the really subtle ordering requirement it brings, so that we don't need to worry about forgetting that if new drivers want to add flush queue support, and can consolidate the existing versions. While we're adding to the kerneldoc, also fill in some info for @freelist which was overlooked previously. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/bf5f8e2ad84e48c712ccbf80fa8c610594c7595f.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 141779d76035..f7679f6684b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -161,16 +161,22 @@ enum iommu_dev_features { * @start: IOVA representing the start of the range to be flushed * @end: IOVA representing the end of the range to be flushed (inclusive) * @pgsize: The interval at which to perform the flush + * @freelist: Removed pages to free after sync + * @queued: Indicates that the flush will be queued * * This structure is intended to be updated by multiple calls to the * ->unmap() function in struct iommu_ops before eventually being passed - * into ->iotlb_sync(). + * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after + * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to + * them. @queued is set to indicate when ->iotlb_flush_all() will be called + * later instead of ->iotlb_sync(), so drivers may optimise accordingly. */ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; struct page *freelist; + bool queued; }; /** -- cgit v1.2.3 From a8e5f04458c4e496eada2b029ce96713bb6c388d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:29 +0100 Subject: iommu/io-pgtable: Remove non-strict quirk IO_PGTABLE_QUIRK_NON_STRICT was never a very comfortable fit, since it's not a quirk of the pagetable format itself. Now that we have a more appropriate way to convey non-strict unmaps, though, this last of the non-quirk quirks can also go, and with the flush queue code also now enforcing its own ordering we can have a lovely cleanup all round. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/155b5c621cd8936472e273a8b07a182f62c6c20d.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..9ba6d9ea316e 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -73,10 +73,6 @@ struct io_pgtable_cfg { * to support up to 35 bits PA where the bit32, bit33 and bit34 are * encoded in the bit9, bit4 and bit5 of the PTE respectively. * - * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs - * on unmap, for DMA domains using the flush queue mechanism for - * delayed invalidation. - * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. 
* @@ -86,7 +82,6 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) - #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; -- cgit v1.2.3 From bf3aed4660c6e3c44c69f07d8927ee5a22a952ac Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:30 +0100 Subject: iommu: Introduce explicit type for non-strict DMA domains Promote the difference between strict and non-strict DMA domains from an internal detail to a distinct domain feature and type, to pave the road for exposing it through the sysfs default domain interface. Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/08cd2afaf6b63c58ad49acec3517c9b32c2bb946.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f7679f6684b1..5629ae42951f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -61,6 +61,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API implementation */ #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ +#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ /* * This are the possible domain-types @@ -73,12 +74,17 @@ struct iommu_domain_geometry { * IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations. * This flag allows IOMMU drivers to implement * certain optimizations for these domains + * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB + * invalidation. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING) #define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API) +#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ + __IOMMU_DOMAIN_DMA_API | \ + __IOMMU_DOMAIN_DMA_FQ) struct iommu_domain { unsigned type; @@ -90,6 +96,11 @@ struct iommu_domain { struct iommu_dma_cookie *iova_cookie; }; +static inline bool iommu_is_dma_domain(struct iommu_domain *domain) +{ + return domain->type & __IOMMU_DOMAIN_DMA_API; +} + enum iommu_cap { IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA transactions */ -- cgit v1.2.3 From c208916fe6c7b84e3ec95cd91853039596eeb2cf Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:34 +0100 Subject: iommu: Express DMA strictness via the domain type Eliminate the iommu_get_dma_strict() indirection and pipe the information through the domain type from the beginning. Besides the flow simplification this also has several nice side-effects: - Automatically implies strict mode for untrusted devices by virtue of their IOMMU_DOMAIN_DMA override. - Ensures that we only end up using flush queues for drivers which are aware of them and can actually benefit. - Allows us to handle flush queue init failure by falling back to strict mode instead of leaving it to possibly blow up later. 
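In driver terms, the result is that strictness can be derived from the domain type alone; a minimal sketch (the helper name is illustrative, not from the patch):

static bool example_domain_uses_flush_queue(struct iommu_domain *domain)
{
	/* IOMMU_DOMAIN_DMA_FQ implies batched (non-strict) TLB
	 * invalidation; plain IOMMU_DOMAIN_DMA remains strict. */
	return domain->type == IOMMU_DOMAIN_DMA_FQ;
}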
Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/47083d69155577f1367877b1594921948c366eb3.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5629ae42951f..923a8d1c5e39 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -504,7 +504,6 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); void iommu_set_dma_strict(void); -bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- cgit v1.2.3 From 452e69b58c2889e5546edb92d9e66285410f7463 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:38 +0100 Subject: iommu: Allow enabling non-strict mode dynamically Allocating and enabling a flush queue is in fact something we can reasonably do while a DMA domain is active, without having to rebuild it from scratch. Thus we can allow a strict -> non-strict transition from sysfs without requiring to unbind the device's driver, which is of particular interest to users who want to make selective relaxations to critical devices like the one serving their root filesystem. Disabling and draining a queue also seems technically possible to achieve without rebuilding the whole domain, but would certainly be more involved. Furthermore there's not such a clear use-case for tightening up security *after* the device may already have done whatever it is that you don't trust it not to do, so we only consider the relaxation case. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d652966348c78457c38bf18daf369272a4ebc2c9.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 758ca4694257..24607dc3c2ac 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain); /* Setup call for arch DMA mapping code */ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); +int iommu_dma_init_fq(struct iommu_domain *domain); /* The DMA API isn't _quite_ the whole story, though... */ /* @@ -54,6 +55,11 @@ static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, { } +static inline int iommu_dma_init_fq(struct iommu_domain *domain) +{ + return -EINVAL; +} + static inline int iommu_get_dma_cookie(struct iommu_domain *domain) { return -ENODEV; -- cgit v1.2.3 From 39a2d3506b2d53c569a6db13d65b2f3728c4feec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:05:05 +0200 Subject: dma-mapping: add a dma_init_global_coherent helper Add a new helper to initialize the global coherent pool. This both cleans up the existing initialization which indirects through the reserved_mem_ops that are normally only used for struct device, and also allows using the global pool for non-devicetree architectures. 
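A sketch of the non-devicetree usage this enables, with an illustrative carve-out address and size (not taken from any real platform):

static int __init example_coherent_pool_init(void)
{
	/* 1 MiB of dedicated on-chip RAM for coherent allocations. */
	return dma_init_global_coherent(0x10000000, SZ_1M);
}
core_initcall(example_coherent_pool_init);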
Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2f842498c448..068f1b11a6a4 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -177,7 +177,7 @@ void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) -- cgit v1.2.3 From 0cad6246621b5887d5b33fea84219d2a71f2f99a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Aug 2021 22:08:24 +0200 Subject: vfs: add rcu argument to ->get_acl() callback Add a rcu argument to the ->get_acl() callback to allow get_cached_acl_rcu() to call the ->get_acl() method in the next patch. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c6c34db94..73376dfe28d0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2065,7 +2065,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct user_namespace *, struct inode *, int); - struct posix_acl * (*get_acl)(struct inode *, int); + struct posix_acl * (*get_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); -- cgit v1.2.3 From 332f606b32b6291a944c8cf23b91f53a6e676525 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Aug 2021 22:08:24 +0200 Subject: ovl: enable RCU'd ->get_acl() Overlayfs does not cache ACL's (to avoid double caching). Instead it just calls the underlying filesystem's i_op->get_acl(), which will return the cached value, if possible. In rcu path walk, however, get_cached_acl_rcu() is employed to get the value from the cache, which will fail on overlayfs resulting in dropping out of rcu walk mode. This can result in a big performance hit in certain situations. Fix by calling ->get_acl() with rcu=true in case of ACL_DONT_CACHE (which indicates pass-through) Reported-by: garyhuang Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 5 +++++ include/linux/posix_acl.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 73376dfe28d0..c6e5bcbff0c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,6 +581,11 @@ static inline void mapping_allow_writable(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +/* + * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to + * cache the ACL. This also means that ->get_acl() can be called in RCU mode + * with the LOOKUP_RCU flag. 
+ */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 307094ebb88c..b65c877d92b8 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -72,6 +72,8 @@ extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct user_namespace *, struct inode *, int, struct posix_acl *); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); + #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, @@ -84,7 +86,6 @@ extern int simple_set_acl(struct user_namespace *, struct inode *, extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); -struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); -- cgit v1.2.3 From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:56 -0400 Subject: tracing: Add DYNAMIC flag for dynamic events To differentiate between static and dynamic events, add a new flag DYNAMIC to the event flags that all dynamic events have set. This will allow to differentiate when attaching to a dynamic event from a static event. Static events have a mod pointer that references the module they were created in (or NULL for core kernel). This can be incremented when the event has something attached to it. But there exists no such mechanism for dynamic events. This is dangerous as the dynamic events may now disappear without the "attachment" knowing that it no longer exists. To enforce the dynamic flag, change dyn_event_add() to pass the event that is being created such that it can set the DYNAMIC flag of the event. This helps make sure that no location that creates a dynamic event misses setting this flag. 
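A simplified sketch of the enforcement point described above (the real dyn_event_add() also validates its arguments and links the event into the dyn_event list under event_mutex):

static inline int dyn_event_add(struct dyn_event *ev,
				struct trace_event_call *call)
{
	/* Every dynamic event is tagged at creation time. */
	call->flags |= TRACE_EVENT_FL_DYNAMIC;
	/* ... list insertion elided ... */
	return 0;
}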
Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..53c9dffd87fd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -310,6 +310,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, }; @@ -321,6 +322,7 @@ enum { * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint + * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe */ @@ -330,6 +332,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; -- cgit v1.2.3 From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:57 -0400 Subject: tracing: Have dynamic events have a ref counter As dynamic events are not created by modules, if something is attached to one, calling "try_module_get()" on its "mod" field, is not going to keep the dynamic event from going away. Since dynamic events do not need the "mod" pointer of the event structure, make a union out of it in order to save memory (there's one structure for each of the thousand+ events in the kernel), and have any event with the DYNAMIC flag set to use a ref counter instead. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 45 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 53c9dffd87fd..9564c4d9a3b6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -350,7 +350,14 @@ struct trace_event_call { struct trace_event event; char *print_fmt; struct event_filter *filter; - void *mod; + /* + * Static events can disappear with modules, + * where as dynamic ones need their own ref count. 
+ */ + union { + void *module; + atomic_t refcnt; + }; void *data; /* See the TRACE_EVENT_FL_* flags above */ @@ -366,6 +373,42 @@ struct trace_event_call { #endif }; +#ifdef CONFIG_DYNAMIC_EVENTS +bool trace_event_dyn_try_get_ref(struct trace_event_call *call); +void trace_event_dyn_put_ref(struct trace_event_call *call); +bool trace_event_dyn_busy(struct trace_event_call *call); +#else +static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) +{ + /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ + return false; +} +static inline void trace_event_dyn_put_ref(struct trace_event_call *call) +{ +} +static inline bool trace_event_dyn_busy(struct trace_event_call *call) +{ + /* Nothing should call this without DYNAIMIC_EVENTS configured. */ + return true; +} +#endif + +static inline bool trace_event_try_get_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + return trace_event_dyn_try_get_ref(call); + else + return try_module_get(call->module); +} + +static inline void trace_event_put_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + trace_event_dyn_put_ref(call); + else + module_put(call->module); +} + #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { -- cgit v1.2.3 From 6937b7dd434962377e00efc04adac0390c287199 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 17 Aug 2021 23:34:59 +0530 Subject: PCI: Add support for ACPI _RST reset method _RST is a standard ACPI method that performs a function level reset of a device (ACPI v6.3, sec 7.3.25). Add pci_dev_acpi_reset() to probe for _RST method and execute if present. The default priority of this reset is set to below device-specific and above hardware resets. Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-9-ameynarkhede03@gmail.com Signed-off-by: Shanker Donthineni Signed-off-by: Bjorn Helgaas Reviewed-by: Sinan Kaya Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1f4d248617b..98718f46a61c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -50,7 +50,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 6 +#define PCI_NUM_RESET_METHODS 7 /* * The PCI interface treats multi-function devices as independent -- cgit v1.2.3 From 9bdc81ce440ec6ea899b236879aee470ec388020 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:35:00 +0530 Subject: PCI: Change the type of probe argument in reset functions Change the type of probe argument in functions which implement reset methods from int to bool to make the context and intent clear. 
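A sketch of the resulting reset-method shape (the example_* names are placeholders for any concrete method):

static int example_reset_method(struct pci_dev *dev, bool probe)
{
	if (!example_reset_supported(dev))
		return -ENOTTY;

	if (probe)		/* PCI_RESET_PROBE */
		return 0;	/* supported, but do nothing */

	return example_do_reset(dev);	/* PCI_RESET_DO_RESET */
}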
Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-10-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ++++- include/linux/pci_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 98718f46a61c..a46363f29b68 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,9 @@ /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ #define PCI_NUM_RESET_METHODS 7 +#define PCI_RESET_PROBE true +#define PCI_RESET_DO_RESET false + /* * The PCI interface treats multi-function devices as independent * devices. The slot/function address of each device is encoded @@ -1234,7 +1237,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -int pcie_reset_flr(struct pci_dev *dev, int probe); +int pcie_reset_flr(struct pci_dev *dev, bool probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 2dac431d94ac..3a10d6ec3ee7 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -44,7 +44,7 @@ struct hotplug_slot_ops { int (*get_attention_status) (struct hotplug_slot *slot, u8 *value); int (*get_latch_status) (struct hotplug_slot *slot, u8 *value); int (*get_adapter_status) (struct hotplug_slot *slot, u8 *value); - int (*reset_slot) (struct hotplug_slot *slot, int probe); + int (*reset_slot) (struct hotplug_slot *slot, bool probe); }; /** -- cgit v1.2.3 From 39f75da7bcc829ddc4d40bb60d0e95520de7898b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:31 +0300 Subject: isystem: trim/fixup stdarg.h and other headers Delete/fixup few includes in anticipation of global -isystem compile option removal. Note: crypto/aegis128-neon-inner.c keeps <stdarg.h> due to redefinition of uintptr_t error (one definition comes from <stdint.h>, another from <linux/types.h>). Signed-off-by: Alexey Dobriyan Signed-off-by: Masahiro Yamada --- include/linux/filter.h | 2 -- include/linux/mISDNif.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 83b896044e79..c1711c9f9439 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -5,8 +5,6 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ -#include <stdarg.h> - #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/compat.h> diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index a7330eb3ec64..7dd1f01ec4f9 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -18,7 +18,6 @@ #ifndef mISDNIF_H #define mISDNIF_H -#include <stdarg.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/socket.h> -- cgit v1.2.3 From c0891ac15f0428ffa81b2e818d416bdf3cb74ab6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:32 +0300 Subject: isystem: ship and use stdarg.h Ship minimal stdarg.h (1 type, 4 macros) as <linux/stdarg.h>. stdarg.h is the only userspace header commonly used in the kernel. GPL 2 version of <stdarg.h> can be extracted from http://archive.debian.org/debian/pool/main/g/gcc-4.2/gcc-4.2_4.2.4.orig.tar.gz Signed-off-by: Alexey Dobriyan Acked-by: Rafael J.
Wysocki Acked-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- include/linux/kernel.h | 2 +- include/linux/printk.h | 2 +- include/linux/stdarg.h | 11 +++++++++++ include/linux/string.h | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 include/linux/stdarg.h (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1b2f0a7e00d6..2776423a587e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -2,7 +2,7 @@ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/align.h> #include <linux/limits.h> #include <linux/linkage.h> diff --git a/include/linux/printk.h b/include/linux/printk.h index e834d78f0478..9f3f29ea348e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -2,7 +2,7 @@ #ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/init.h> #include <linux/kern_levels.h> #include <linux/linkage.h> diff --git a/include/linux/stdarg.h b/include/linux/stdarg.h new file mode 100644 index 000000000000..c8dc7f4f390c --- /dev/null +++ b/include/linux/stdarg.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#ifndef _LINUX_STDARG_H +#define _LINUX_STDARG_H + +typedef __builtin_va_list va_list; +#define va_start(v, l) __builtin_va_start(v, l) +#define va_end(v) __builtin_va_end(v) +#define va_arg(v, T) __builtin_va_arg(v, T) +#define va_copy(d, s) __builtin_va_copy(d, s) + +#endif diff --git a/include/linux/string.h b/include/linux/string.h index b48d2d28e0b1..5e96d656be7a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -6,7 +6,7 @@ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ #include <linux/errno.h> /* for E2BIG */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <uapi/linux/string.h> extern char *strndup_user(const char __user *, long); -- cgit v1.2.3 From 22f9feb49950885cdb6e37513f134d154175e743 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:37:00 +0200 Subject: dma-mapping: make the global coherent pool conditional Only build the code to support the global coherent pool if support for it is enabled.
Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 068f1b11a6a4..0d5b06b3a4a6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -171,13 +171,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - -void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle); -int dma_release_from_global_coherent(int order, void *vaddr); -int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, - size_t size, int *ret); -int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) @@ -187,7 +180,16 @@ static inline int dma_declare_coherent_memory(struct device *dev, #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#ifdef CONFIG_DMA_GLOBAL_POOL +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle); +int dma_release_from_global_coherent(int order, void *vaddr); +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, + size_t size, int *ret); +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); +#else static inline void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle) { @@ -202,7 +204,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, { return 0; } -#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#endif /* CONFIG_DMA_GLOBAL_POOL */ /* * This is the actual return value from the ->alloc_noncontiguous method. -- cgit v1.2.3 From 4d99efb229e63928c6b03a756a2e38cd4777fbe8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:44 +0800 Subject: iommu/vt-d: Update the virtual command related registers The VT-d spec Revision 3.3 updated the virtual command registers, virtual command opcode B register, virtual command response register and virtual command capability register (Section 10.4.43, 10.4.44, 10.4.45, 10.4.46). This updates the virtual command interface implementation in the Intel IOMMU driver accordingly. 
Fixes: 24f27d32ab6b7 ("iommu/vt-d: Enlightened PASID allocation") Signed-off-by: Lu Baolu Cc: Ashok Raj Cc: Sanjay Kumar Cc: Kevin Tian Link: https://lore.kernel.org/r/20210713042649.3547403-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d0fa0b31994d..05a65eb155f7 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,9 +124,9 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 -#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe10 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) -- cgit v1.2.3 From 48811c44349ffbb778d3e36b53beb03ad43a979c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:49 +0800 Subject: iommu/vt-d: Allow devices to have more than 32 outstanding PRs The minimum per-IOMMU PRQ queue size is one 4K page, this is more entries than the hardcoded limit of 32 in the current VT-d code. Some devices can support up to 512 outstanding PRQs but underutilized by this limit of 32. Although, 32 gives some rough fairness when multiple devices share the same IOMMU PRQ queue, but far from optimal for customized use case. This extends the per-IOMMU PRQ queue size to four 4K pages and let the devices have as many outstanding page requests as they can. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 10fa80eef13a..57cceecbe37f 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,6 +14,11 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) +/* Page Request Queue depth */ +#define PRQ_ORDER 2 +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) +#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) + /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done -- cgit v1.2.3 From e1c6b9e1669e44fb7f9688e34e460b759e3b9187 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 2 Aug 2021 19:28:08 +0000 Subject: entry: KVM: Allow use of generic KVM entry w/o full generic support Some architectures (e.g. arm64) have yet to adopt the generic entry infrastructure. Despite that, it would be nice to use some common plumbing for guest entry/exit handling. For example, KVM/arm64 currently does not handle TIF_NOTIFY_PENDING correctly. Allow use of only the generic KVM entry code by tightening up the include list. No functional change intended. 
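A sketch of the run-loop check this makes available on such architectures (a simplified fragment, assuming a vcpu run loop; the surrounding control flow is illustrative):

	int ret;

	/* Before entering the guest, handle any pending work. */
	if (__xfer_to_guest_mode_work_pending()) {
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (ret)
			return ret;
	}
	/* ... enter the guest ... */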
Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210802192809.1851010-3-oupton@google.com --- include/linux/entry-kvm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 136b8d97d8c0..0d7865a0731c 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -2,7 +2,11 @@ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H -#include <linux/entry-common.h> +#include <linux/static_call_types.h> +#include <linux/tracehook.h> +#include <linux/syscalls.h> +#include <linux/seccomp.h> +#include <linux/sched.h> #include <asm/entry-kvm.h> /* Transfer to guest mode work */ -- cgit v1.2.3 From 1cf362e907f36f104b9cf590ee6ced786226b388 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:37 +0530 Subject: PCI: endpoint: Add support to add virtual function in endpoint core Add support to add virtual function in endpoint core. The virtual function can only be associated with a physical function instead of an endpoint controller. Provide APIs to associate a virtual function with a physical function here. [weiyongjun1@huawei.com: PCI: endpoint: Fix missing unlock on error in pci_epf_add_vepf() - Reported-by: Hulk Robot ] Link: https://lore.kernel.org/r/20210819123343.1951-3-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Wei Yongjun Signed-off-by: Lorenzo Pieralisi --- include/linux/pci-epf.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2debc27ba95e..043b4c9c7188 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -121,8 +121,10 @@ struct pci_epf_bar { * @bar: represents the BAR of EPF device * @msi_interrupts: number of MSI interrupts required by this function * @msix_interrupts: number of MSI-X interrupts required by this function - * @func_no: unique function number within this endpoint device + * @func_no: unique (physical) function number within this endpoint device + * @vfunc_no: unique virtual function number within a physical function * @epc: the EPC device to which this EPF device is bound + * @epf_pf: the physical EPF device to which this virtual EPF device is bound * @driver: the EPF driver to which this EPF device is bound * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc * @nb: notifier block to notify EPF of any EPC events (like linkup) @@ -133,6 +135,10 @@ struct pci_epf_bar { * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC * @sec_epc_func_no: unique (physical) function number within the secondary EPC * @group: configfs group associated with the EPF device + * @is_bound: indicates if bind notification to function driver has been invoked + * @is_vf: true - virtual function, false - physical function + * @vfunction_num_map: bitmap to manage virtual function number + * @pci_vepf: list of virtual endpoint functions associated with this function */ struct pci_epf { struct device dev; @@ -142,8 +148,10 @@ struct pci_epf { u8 msi_interrupts; u16 msix_interrupts; u8 func_no; + u8 vfunc_no; struct pci_epc *epc; + struct pci_epf *epf_pf; struct pci_epf_driver *driver; struct list_head list; struct notifier_block nb; @@ -156,6 +164,10 @@ struct pci_epf { struct pci_epf_bar sec_epc_bar[6]; u8 sec_epc_func_no; struct config_group *group; + unsigned int is_bound; + unsigned int is_vf; + unsigned long vfunction_num_map; + struct list_head pci_vepf; }; /** @@ -199,4 +211,6 @@ int pci_epf_bind(struct pci_epf *epf); void pci_epf_unbind(struct pci_epf
*epf); struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf, struct config_group *group); +int pci_epf_add_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); +void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); #endif /* __LINUX_PCI_EPF_H */ -- cgit v1.2.3 From 53fd3cbe5e9d791d6bb6059f73a3851f155ce7c6 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:39 +0530 Subject: PCI: endpoint: Add virtual function number in pci_epc ops Add virtual function number in pci_epc ops. EPC controller driver can perform virtual function specific initialization based on the virtual function number. Link: https://lore.kernel.org/r/20210819123343.1951-5-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi --- include/linux/pci-epc.h | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 50a649d33e68..a48778e1a4ee 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -62,31 +62,32 @@ pci_epc_interface_string(enum pci_epc_interface_type type) * @owner: the module owner containing the ops */ struct pci_epc_ops { - int (*write_header)(struct pci_epc *epc, u8 func_no, + int (*write_header)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); - int (*set_bar)(struct pci_epc *epc, u8 func_no, + int (*set_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - void (*clear_bar)(struct pci_epc *epc, u8 func_no, + void (*clear_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - int (*map_addr)(struct pci_epc *epc, u8 func_no, + int (*map_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr, u64 pci_addr, size_t size); - void (*unmap_addr)(struct pci_epc *epc, u8 func_no, + void (*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr); - int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 interrupts); - int (*get_msi)(struct pci_epc *epc, u8 func_no); - int (*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); - int (*get_msix)(struct pci_epc *epc, u8 func_no); - int (*raise_irq)(struct pci_epc *epc, u8 func_no, + int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); + int (*get_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*set_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); + int (*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); - int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, + int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); int (*start)(struct pci_epc *epc); void (*stop)(struct pci_epc *epc); const struct pci_epc_features* (*get_features)(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); struct module *owner; }; @@ -128,6 +129,8 @@ struct pci_epc_mem { * single window. 
* @num_windows: number of windows supported by device * @max_functions: max number of functions that can be configured in this EPC + * @max_vfs: Array indicating the maximum number of virtual functions that can + * be associated with each physical function * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops * @function_num_map: bitmap to manage physical function number @@ -141,6 +144,7 @@ struct pci_epc { struct pci_epc_mem *mem; unsigned int num_windows; u8 max_functions; + u8 *max_vfs; struct config_group *group; /* mutex to protect against concurrent access of EP controller */ struct mutex lock; @@ -208,31 +212,32 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); -int pci_epc_write_header(struct pci_epc *epc, u8 func_no, +int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); -int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, +int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, +void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, +int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u64 pci_addr, size_t size); -void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, +void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr); -int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts); -int pci_epc_get_msi(struct pci_epc *epc, u8 func_no); -int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); -int pci_epc_get_msix(struct pci_epc *epc, u8 func_no); -int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); +int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); +int pci_epc_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); -int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); int pci_epc_start(struct pci_epc *epc); void pci_epc_stop(struct pci_epc *epc); const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); enum pci_barno pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features); enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features -- cgit v1.2.3 From 249dbe74d3c4b568a623fb55c56cddf19fdf0b89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:22 +0100 Subject: ARM: 9108/1: oabi-compat: rework epoll_wait/epoll_pwait emulation The epoll_wait() system call wrapper is one of the remaining users of the set_fs() infrasturcture for Arm. Changing it to not require set_fs() is rather complex unfortunately. 
The approach I'm taking here is to allow architectures to override the code that copies the output to user space, and let the oabi-compat implementation check whether it is getting called from an EABI or OABI system call based on the thread_info->syscall value. The in_oabi_syscall() check here mirrors the in_compat_syscall() and in_x32_syscall() helpers for 32-bit compat implementations on other architectures. Overall, the amount of code goes down, at least with the newly added sys_oabi_epoll_pwait() helper getting removed again. The downside is added complexity in the source code for the native implementation. There should be no difference in runtime performance except for Arm kernels with CONFIG_OABI_COMPAT enabled that now have to go through an external function call to check which of the two variants to use. Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- include/linux/eventpoll.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 593322c946e6..3337745d81bd 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -68,4 +68,22 @@ static inline void eventpoll_release(struct file *file) {} #endif +#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) +/* ARM OABI has an incompatible struct layout and needs a special handler */ +extern struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent); +#else +static inline struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent) +{ + if (__put_user(revents, &uevent->events) || + __put_user(data, &uevent->data)) + return NULL; + + return uevent+1; +} +#endif + #endif /* #ifndef _LINUX_EVENTPOLL_H */ -- cgit v1.2.3 From bdec0145286f7e6be9b3134aa35f0f335fa27c38 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:23 +0100 Subject: ARM: 9114/1: oabi-compat: rework sys_semtimedop emulation sys_oabi_semtimedop() is one of the last users of set_fs() on Arm. To remove this one, expose the internal code of the actual implementation that operates on a kernel pointer and call it directly after copying. There should be no measurable impact on the normal execution of this function, and it makes the overly long function a little shorter, which may help readability. While reworking the oabi version, make it behave a little more like the native one, using kvmalloc_array() and restructure the code flow in a similar way. The naming of __do_semtimedop() is not very good, I hope someone can come up with a better name. One regression was spotted by kernel test robot and fixed before the first mailing list submission. 
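A minimal sketch of the shape this rework takes, assuming a wrapper that copies the OABI sembufs up front and then calls __do_semtimedop() directly; the function name, includes and error handling are illustrative, not the actual arch/arm code:

#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/sem.h>
#include <linux/uaccess.h>

static long example_oabi_semtimedop(int semid, struct sembuf __user *tsops,
				    unsigned int nsops)
{
	struct sembuf *sops;
	long err;

	if (nsops < 1)
		return -EINVAL;

	/* Copy from user space once, then work on kernel pointers only. */
	sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
	if (!sops)
		return -ENOMEM;
	if (copy_from_user(sops, tsops, nsops * sizeof(*sops))) {
		err = -EFAULT;
		goto out;
	}
	/* NULL timeout here, i.e. plain semop() semantics in this sketch. */
	err = __do_semtimedop(semid, sops, nsops, NULL,
			      current->nsproxy->ipc_ns);
out:
	kvfree(sops);
	return err;
}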
Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..6c6fc3fd5b72 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1373,6 +1373,9 @@ long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +long __do_semtimedop(int semid, struct sembuf *tsems, unsigned int nsops, + const struct timespec64 *timeout, + struct ipc_namespace *ns); int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); -- cgit v1.2.3 From f7403abf5f06f407c50252e003f5fb332325147b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 20 Aug 2021 14:14:42 +0100 Subject: iommu/io-pgtable: Abstract iommu_iotlb_gather access Previously io-pgtable merely passed the iommu_iotlb_gather pointer through to helpers, but now it has grown its own direct dereference. This turns out to break the build for !IOMMU_API configs where the structure only has a dummy definition. It will probably also crash drivers who don't use the gather mechanism and simply pass in NULL. Wrap this dereference in a suitable helper which can both be stubbed out for !IOMMU_API and encapsulate a NULL check otherwise. Fixes: 7a7c5badf858 ("iommu: Indicate queued flushes via gather data") Reported-by: kernel test robot Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/83672ee76f6405c82845a55c148fa836f56fbbc1.1629465282.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 923a8d1c5e39..a23779c093c7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -548,6 +548,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->start = start; } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return gather && gather->queued; +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -896,6 +901,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, { } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return false; +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3 From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. 
SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: <name>=$<field>[:TYPE] - Fetch the given field of the tracepoint and print it as the given TYPE with the given name. Supported types are basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types (x8/x16/x32/x64), "string", "ustring" and bitfield. Example: attach an event probe on the openat system call and print the name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in the events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, and it can be matched in synthetic events. There is one limitation: an event probe cannot be attached to a kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9564c4d9a3b6..0a0144580bbd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,6 +313,7 @@ enum { TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, + TRACE_EVENT_FL_EPROBE_BIT, }; /* @@ -325,6 +326,7 @@ enum { * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe + * EPROBE - Event is an event probe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -335,6 +337,7 @@ enum { TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), + TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) @@ -680,6 +683,7 @@ enum event_trigger_type { ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), + ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); -- cgit v1.2.3 From edb298c663fccad65fe99fcec6a4f96cc344520d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:39 +0300 Subject: KVM: x86/mmu: bump mmu notifier count in kvm_zap_gfn_range This, together with the previous patch, ensures that kvm_zap_gfn_range doesn't race with a page fault running on another vcpu, and will make the page fault code retry instead.
This is based on a patch suggested by Sean Christopherson: https://lkml.org/lkml/2021/7/22/1025 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f50bfcf225f0..4e43843fe0d7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -991,6 +991,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, -- cgit v1.2.3 From f95937ccf5bd5e0a6bbac2b8e65a87982ffae403 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:29 +0000 Subject: KVM: stats: Support linear and logarithmic histogram statistics Add new types of KVM stats, linear and logarithmic histogram. Histogram are very useful for observing the value distribution of time or size related stats. Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-2-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 90 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e43843fe0d7..09fc0274b1eb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1356,56 +1356,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define 
STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1424,6 +1434,14 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) @@ -1437,10 +1455,52 @@ struct _kvm_stats_desc { STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. 
+ * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; -- cgit v1.2.3 From 87bcc5fa092f82a9890f9e73e4f4c7016ef64049 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:32 +0000 Subject: KVM: stats: Add halt_wait_ns stats for all architectures Add simple stats halt_wait_ns to record the time a VCPU has spent on waiting for all architectures (not just powerpc). Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-5-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 09fc0274b1eb..58a8ffee265e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1452,7 +1452,8 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index ed6a985c5680..291ef55125b2 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -87,6 +87,7 @@ struct kvm_vcpu_stat_generic { u64 halt_wakeup; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 halt_wait_ns; }; #define KVM_STATS_NAME_SIZE 48 -- cgit v1.2.3 From 8ccba534a1a5c6565220c81113d6157571f380cb Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:33 +0000 Subject: KVM: stats: Add halt polling related histogram stats Add three log histogram stats to record the distribution of time spent on successful polling, failed polling and VCPU wait. halt_poll_success_hist: Distribution of spent time for a successful poll. halt_poll_fail_hist: Distribution of spent time for a failed poll. halt_wait_hist: Distribution of time a VCPU has spent on waiting. 
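A sketch of how a halt-polling call site might feed these histograms using the helpers above; the surrounding function is invented for illustration, while vcpu->stat.generic and the update macro come from this series:

#include <linux/kvm_host.h>

static void example_record_halt_poll(struct kvm_vcpu *vcpu, ktime_t start,
				     ktime_t cur, bool success)
{
	u64 poll_ns = ktime_to_ns(ktime_sub(cur, start));

	/* fls64() picks the bucket; the helper clamps to the last one. */
	if (success)
		KVM_STATS_LOG_HIST_UPDATE(
			vcpu->stat.generic.halt_poll_success_hist, poll_ns);
	else
		KVM_STATS_LOG_HIST_UPDATE(
			vcpu->stat.generic.halt_poll_fail_hist, poll_ns);
}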
Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-6-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 +++++++- include/linux/kvm_types.h | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 58a8ffee265e..e4d712e9f760 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1453,7 +1453,13 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ + HALT_POLL_HIST_COUNT) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 291ef55125b2..de7fb5f364d8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,6 +76,8 @@ struct kvm_mmu_memory_cache { }; #endif +#define HALT_POLL_HIST_COUNT 32 + struct kvm_vm_stat_generic { u64 remote_tlb_flush; }; @@ -88,6 +90,9 @@ struct kvm_vcpu_stat_generic { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 halt_wait_ns; + u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; + u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; + u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; }; #define KVM_STATS_NAME_SIZE 48 -- cgit v1.2.3 From 76f3c032adad86aad26f8ad3eebc993b4ba32138 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 20:59:31 +0200 Subject: PCI/VPD: Add pci_vpd_alloc() Several users of the VPD API use a fixed-size buffer and read the VPD into it for further usage. This requires special handling for the case that the buffer isn't big enough to hold the full VPD data. Also the buffer is often allocated on the stack, which isn't too nice. Add pci_vpd_alloc() to dynamically allocate buffer of the correct size and read VPD into it. Link: https://lore.kernel.org/r/955ff598-0021-8446-f856-0c2c077635d7@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index e752cc39a1fe..8c681e24be8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2330,6 +2330,15 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) return info_field[2]; } +/** + * pci_vpd_alloc - Allocate buffer and read VPD into it + * @dev: PCI device + * @size: pointer to field where VPD length is returned + * + * Returns pointer to allocated buffer or an ERR_PTR in case of failure + */ +void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); + /** * pci_vpd_find_tag - Locates the Resource Data Type tag provided * @buf: Pointer to buffered vpd data -- cgit v1.2.3 From 9e515c9f6c0b6f0ace6f5cf2202b527d745b494d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 21:00:57 +0200 Subject: PCI/VPD: Add pci_vpd_find_ro_info_keyword() All users of pci_vpd_find_info_keyword() are interested in the VPD RO section only. In addition all calls are followed by the same activities to calculate start of tag data area and size of the data area. 
Add pci_vpd_find_ro_info_keyword() that combines these functionalities. pci_vpd_find_info_keyword() can be phased out once all users are converted. [bhelgaas: split pci_vpd_check_csum() to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8c681e24be8b..9e3b60963a52 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2363,6 +2363,19 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw); +/** + * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section + * @buf: Pointer to buffered VPD data + * @len: The length of the buffer area in which to search + * @kw: The keyword to search for + * @size: Pointer to field where length of found keyword data is returned + * + * Returns the index of the information field keyword data or -ENOENT if + * not found. + */ +int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, + const char *kw, unsigned int *size); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3 From 6107e5cb907cffc5576cc1297847f9fc69a8d5d9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 20 Aug 2021 15:32:42 -0500 Subject: PCI/VPD: Add pci_vpd_check_csum() VPD checksum information and checksum calculation are specified by PCIe r5.0, sec 6.28.2.2. Therefore checksum handling can and should be moved into the PCI VPD core. Add pci_vpd_check_csum() to validate the VPD checksum. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e3b60963a52..827b7eefd550 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2376,6 +2376,15 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, const char *kw, unsigned int *size); +/** + * pci_vpd_check_csum - Check VPD checksum + * @buf: Pointer to buffered VPD data + * @len: VPD size + * + * Returns 1 if VPD has no checksum, otherwise 0 or an errno + */ +int pci_vpd_check_csum(const void *buf, unsigned int len); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3 From f0ab00174eb7574732737fc0734d4b406aed6231 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Aug 2021 17:17:28 -0500 Subject: PCI: Make saved capability state private to core Interfaces and structs for saving and restoring PCI Capability state were declared in include/linux/pci.h, but aren't needed outside drivers/pci/. 
Move these to drivers/pci/pci.h: struct pci_cap_saved_data struct pci_cap_saved_state void pci_allocate_cap_save_buffers() void pci_free_cap_save_buffers() int pci_add_cap_save_buffer() int pci_add_ext_cap_save_buffer() struct pci_cap_saved_state *pci_find_saved_cap() struct pci_cap_saved_state *pci_find_saved_ext_cap() Link: https://lore.kernel.org/r/20210802221728.1469304-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..fd35327812af 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -288,18 +288,6 @@ enum pci_bus_speed { enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); -struct pci_cap_saved_data { - u16 cap_nr; - bool cap_extended; - unsigned int size; - u32 data[]; -}; - -struct pci_cap_saved_state { - struct hlist_node next; - struct pci_cap_saved_data cap; -}; - struct irq_affinity; struct pcie_link_state; struct pci_vpd; @@ -1278,12 +1266,6 @@ int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state); int pci_load_and_free_saved_state(struct pci_dev *dev, struct pci_saved_state **state); -struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); -struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, - u16 cap); -int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); -int pci_add_ext_cap_save_buffer(struct pci_dev *dev, - u16 cap, unsigned int size); int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); -- cgit v1.2.3 From ca32b5310a1a3835f81f498367f1bb7450c8b67b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 13 Jul 2021 15:22:36 +0800 Subject: PCI: Optimize pci_resource_len() to reduce kernel size pci_resource_end() can be 0 only when pci_resource_start() is 0. Otherwise, it is definitely an error. In this case, pci_resource_len() should be regarded as 0. Therefore, determining whether pci_resource_start() and pci_resource_end() are both 0 can be reduced to determining only whether pci_resource_end() is 0. Although only one condition judgment is reduced, the macro function pci_resource_len() is widely referenced in the kernel. I used defconfig to compile the latest kernel on X86, and its binary code size was reduced by about 3KB. 
Before: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a67168 0000000000000018 I 68 1 8 After: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a66598 0000000000000018 I 68 1 8 Link: https://lore.kernel.org/r/20210713072236.3043-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..23ef1a15eb5d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1881,9 +1881,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); #define pci_resource_end(dev, bar) ((dev)->resource[(bar)].end) #define pci_resource_flags(dev, bar) ((dev)->resource[(bar)].flags) #define pci_resource_len(dev,bar) \ - ((pci_resource_start((dev), (bar)) == 0 && \ - pci_resource_end((dev), (bar)) == \ - pci_resource_start((dev), (bar))) ? 0 : \ + ((pci_resource_end((dev), (bar)) == 0) ? 0 : \ \ (pci_resource_end((dev), (bar)) - \ pci_resource_start((dev), (bar)) + 1)) -- cgit v1.2.3 From 817f9916a6e96ae43acdd4e75459ef4f92d96eb1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Aug 2021 18:36:19 +0300 Subject: PCI: Sync __pci_register_driver() stub for CONFIG_PCI=n The CONFIG_PCI=y case got a new parameter long time ago. Sync the stub as well. [bhelgaas: add parameter names] Fixes: 725522b5453d ("PCI: add the sysfs driver name to all modules") Link: https://lore.kernel.org/r/20210813153619.89574-1-andriy.shevchenko@linux.intel.com Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index fd35327812af..a662f6c1f120 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1722,8 +1722,9 @@ static inline void pci_disable_device(struct pci_dev *dev) { } static inline int pcim_enable_device(struct pci_dev *pdev) { return -EIO; } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } -static inline int __pci_register_driver(struct pci_driver *drv, - struct module *owner) +static inline int __must_check __pci_register_driver(struct pci_driver *drv, + struct module *owner, + const char *mod_name) { return 0; } static inline int pci_register_driver(struct pci_driver *drv) { return 0; } -- cgit v1.2.3 From 60bcd91aafd22ef62cef9ae2037fa2e1d4da2fb3 Mon Sep 17 00:00:00 2001 From: Grzegorz Jaszczyk Date: Fri, 18 Jun 2021 21:50:32 +0200 Subject: watchdog: introduce watchdog_dev_suspend/resume The watchdog drivers often disable wdog clock during suspend and then enable it again during resume. Nevertheless the ping worker is still running and can issue low-level ping while the wdog clock is disabled causing the system hang. To prevent such condition register pm notifier in the watchdog core which will call watchdog_dev_suspend/resume and actually cancel ping worker during suspend and restore it back, if needed, during resume. 
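For a driver whose wdog clock is gated while suspended, opting in amounts to one call at probe time. A hedged sketch, with the driver itself made up and watchdog_stop_ping_on_suspend() being the helper this patch adds:

#include <linux/platform_device.h>
#include <linux/watchdog.h>

static int example_wdt_probe(struct platform_device *pdev)
{
	struct watchdog_device *wdd;

	wdd = devm_kzalloc(&pdev->dev, sizeof(*wdd), GFP_KERNEL);
	if (!wdd)
		return -ENOMEM;

	/* wdd->info, wdd->ops, timeouts and clocks would be set up here. */

	/* Park the ping worker across suspend; the core restores it on resume. */
	watchdog_stop_ping_on_suspend(wdd);

	return devm_watchdog_register_device(&pdev->dev, wdd);
}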
Signed-off-by: Grzegorz Jaszczyk Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20210618195033.3209598-2-grzegorz.jaszczyk@linaro.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 9b19e6bb68b5..99660197a36c 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -107,6 +107,7 @@ struct watchdog_device { unsigned int max_hw_heartbeat_ms; struct notifier_block reboot_nb; struct notifier_block restart_nb; + struct notifier_block pm_nb; void *driver_data; struct watchdog_core_data *wd_data; unsigned long status; @@ -116,6 +117,7 @@ struct watchdog_device { #define WDOG_STOP_ON_REBOOT 2 /* Should be stopped on reboot */ #define WDOG_HW_RUNNING 3 /* True if HW watchdog running */ #define WDOG_STOP_ON_UNREGISTER 4 /* Should be stopped on unregister */ +#define WDOG_NO_PING_ON_SUSPEND 5 /* Ping worker should be stopped on suspend */ struct list_head deferred; }; @@ -156,6 +158,12 @@ static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd) set_bit(WDOG_STOP_ON_UNREGISTER, &wdd->status); } +/* Use the following function to stop the wdog ping worker when suspending */ +static inline void watchdog_stop_ping_on_suspend(struct watchdog_device *wdd) +{ + set_bit(WDOG_NO_PING_ON_SUSPEND, &wdd->status); +} + /* Use the following function to check if a timeout value is invalid */ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigned int t) { @@ -209,6 +217,8 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, unsigned int timeout_parm, struct device *dev); extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +int watchdog_dev_suspend(struct watchdog_device *wdd); +int watchdog_dev_resume(struct watchdog_device *wdd); int watchdog_set_last_hw_keepalive(struct watchdog_device *, unsigned int); -- cgit v1.2.3 From 6e7c1770a212239e88ec01ddc7a741505bfd10e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jul 2021 16:23:21 -0400 Subject: fs: simplify get_filesystem_list / get_all_fs_names Just output the '\0'-separated list of supported file systems for block devices directly rather than going through a pointless round of string manipulation. Based on an earlier patch from Al Viro. Vivek: Modified list_bdev_fs_names() and split_fs_names() to return the number of null-terminated strings to the caller. Callers now use that information to loop through all the strings instead of relying on one extra null char being present at the end.
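The consumption pattern this enables, as a sketch; the buffer size and printing loop are illustrative assumptions, while the count-based iteration is what the change above provides:

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>

static void __init example_print_bdev_fs_names(void)
{
	char *names = kzalloc(PAGE_SIZE, GFP_KERNEL);
	char *p;
	int i, count;

	if (!names)
		return;

	count = list_bdev_fs_names(names, PAGE_SIZE);
	/* Walk exactly 'count' '\0'-terminated names; no sentinel needed. */
	for (i = 0, p = names; i < count; i++, p += strlen(p) + 1)
		pr_info("block fs: %s\n", p);
	kfree(names);
}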
Signed-off-by: Christoph Hellwig Signed-off-by: Vivek Goyal Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..c76dfc01cf9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3622,7 +3622,7 @@ int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int __init get_filesystem_list(char *buf); +int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) -- cgit v1.2.3 From e5e26d80840b69c1bcea4f5b0cb7ed4026a8f6a3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 21 Aug 2021 00:58:21 +0200 Subject: gpio: max730x: Use the right include Despite the placement of the header, <linux/spi/max7301.h> is used by drivers/gpio/gpio-max730*. The include needs struct gpio_chip and needs to include <linux/gpio/driver.h>, not the legacy <linux/gpio.h> include. Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/spi/max7301.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/max7301.h b/include/linux/spi/max7301.h index 433c20e2f46e..21449067aedb 100644 --- a/include/linux/spi/max7301.h +++ b/include/linux/spi/max7301.h @@ -2,7 +2,7 @@ #ifndef LINUX_SPI_MAX7301_H #define LINUX_SPI_MAX7301_H -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /* * Some registers must be read back to modify. -- cgit v1.2.3 From 15d82ca23c996d50062286d27ed6a42a8105c04a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 27 Jul 2021 02:06:50 +0800 Subject: PCI: Introduce domain_nr in pci_host_bridge Currently we retrieve the PCI domain number of the host bridge from the bus sysdata (or pci_config_window if PCI_DOMAINS_GENERIC=y). Actually we have the information at PCI host bridge probing time, and it makes sense that we store it into pci_host_bridge. One benefit of doing so is the requirement for supporting PCI on Hyper-V for ARM64, because the host bridge of Hyper-V doesn't have pci_config_window, whereas ARM64 is a PCI_DOMAINS_GENERIC=y arch, so we cannot retrieve the PCI domain number from pci_config_window on an ARM64 Hyper-V guest. As preparation for ARM64 Hyper-V PCI support, we introduce the domain_nr in pci_host_bridge and a sentinel value to allow drivers to set domain numbers properly at probing time. Currently, CONFIG_PCI_DOMAINS_GENERIC=y archs are the only users of this newly-introduced field. Link: https://lore.kernel.org/r/20210726180657.142727-2-boqun.feng@gmail.com Signed-off-by: Boqun Feng Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/pci.h | 11 +++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..01aa201e1df0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -526,6 +526,16 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } +/* + * Currently in ACPI spec, for each PCI host bridge, PCI Segment + * Group number is limited to a 16-bit value, therefore (int)-1 is + * not a valid PCI domain number, and can be used as a sentinel + * value indicating ->domain_nr is not set by the driver (and + * CONFIG_PCI_DOMAINS_GENERIC=y archs will set it with + * pci_bus_find_domain_nr()).
+ */ +#define PCI_DOMAIN_NR_NOT_SET (-1) + struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* Root bus */ @@ -533,6 +543,7 @@ struct pci_host_bridge { struct pci_ops *child_ops; void *sysdata; int busnr; + int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ -- cgit v1.2.3 From 90e7a6de62781c27d6a111fccfb19b807f9b6887 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:29 +0300 Subject: lib/scatterlist: Provide a dedicated function to support table append RDMA is the only in-kernel user that uses __sg_alloc_table_from_pages to append pages dynamically. In the next patch, that mode will be extended and that function will get more parameters, so separate it into a unique function to make the change clearer. Link: https://lore.kernel.org/r/20210824142531.3877007-2-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ecf87484814f..5c700f2a0d18 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -285,14 +285,45 @@ void sg_free_table(struct sg_table *); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt, +struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, struct scatterlist *prv, unsigned int left_pages, gfp_t gfp_mask); -int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, - unsigned int n_pages, unsigned int offset, - unsigned long size, gfp_t gfp_mask); +int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, + unsigned int max_segment, gfp_t gfp_mask); + +/** + * sg_alloc_table_from_pages - Allocate and initialize an sg table from + * an array of pages + * @sgt: The sg table header to use + * @pages: Pointer to an array of page pointers + * @n_pages: Number of pages in the pages array + * @offset: Offset from start of the first page to the start of a buffer + * @size: Number of valid bytes in the buffer (after offset) + * @gfp_mask: GFP allocation mask + * + * Description: + * Allocate and initialize an sg table from a list of pages. Contiguous + * ranges of the pages are squashed into a single scatterlist node. A user + * may provide an offset at a start and a size of valid data in a buffer + * specified by the page array. The returned sg table is released by + * sg_free_table.
+ * + * Returns: + * 0 on success, negative error on failure + */ +static inline int sg_alloc_table_from_pages(struct sg_table *sgt, + struct page **pages, + unsigned int n_pages, + unsigned int offset, + unsigned long size, gfp_t gfp_mask) +{ + return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, + size, UINT_MAX, gfp_mask); +} #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, -- cgit v1.2.3 From 3e302dbc6774a27edaea39a1d5107f0c12e35cf2 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:30 +0300 Subject: lib/scatterlist: Fix wrong update of orig_nents orig_nents should represent the number of entries with pages, but __sg_alloc_table_from_pages sets orig_nents as the number of total entries in the table. This is wrong when the API is used for dynamic allocation where not all the table entries are mapped with pages. It wasn't observed until now, since RDMA umem who uses this API in the dynamic form doesn't use orig_nents implicit or explicit by the scatterlist APIs. Fix it by changing the append API to track the SG append table state and have an API to free the append table according to the total number of entries in the table. Now all APIs set orig_nents as number of enries with pages. Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages") Link: https://lore.kernel.org/r/20210824142531.3877007-3-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 5c700f2a0d18..266754a55327 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -39,6 +39,12 @@ struct sg_table { unsigned int orig_nents; /* original size of list */ }; +struct sg_append_table { + struct sg_table sgt; /* The scatter list table */ + struct scatterlist *prv; /* last populated sge in the table */ + unsigned int total_nents; /* Total entries in the table */ +}; + /* * Notes on SG table design. 
* @@ -280,16 +286,17 @@ typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, - sg_free_fn *); + sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); +void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, - struct page **pages, unsigned int n_pages, unsigned int offset, - unsigned long size, unsigned int max_segment, - struct scatterlist *prv, unsigned int left_pages, - gfp_t gfp_mask); +int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, + struct page **pages, unsigned int n_pages, + unsigned int offset, unsigned long size, + unsigned int max_segment, + unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, -- cgit v1.2.3 From a7e20e31f6c063d928868ecc8e2effb7d4b9fe1b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 14:10:09 +0100 Subject: netfs: Move cookie debug ID to struct netfs_cache_resources Move the cookie debug ID from struct netfs_read_request to struct netfs_cache_resources and drop the 'cookie_' prefix. This makes it available for things that want to use netfs_cache_resources without having a netfs_read_request. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431190784.2908479.13386972676539789127.stgit@warthog.procyon.org.uk/ --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 9062adfa2fb9..5d6a4158a9a6 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -102,6 +102,7 @@ struct netfs_cache_resources { const struct netfs_cache_ops *ops; void *cache_priv; void *cache_priv2; + unsigned int debug_id; /* Cookie debug ID */ }; /* @@ -137,7 +138,6 @@ struct netfs_read_request { struct list_head subrequests; /* Requests to fetch I/O from disk or net */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; - unsigned int cookie_debug_id; atomic_t nr_rd_ops; /* Number of read ops in progress */ atomic_t nr_wr_ops; /* Number of write ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ -- cgit v1.2.3 From 343b7258687ecfbb363bfda8833a7cf641aac524 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:08 +0300 Subject: PCI: Add 'override_only' field to struct pci_device_id Add 'override_only' field to struct pci_device_id to be used as part of pci_match_device(). When set, a driver only matches the entry when dev->driver_override is set to that driver. In addition, add a helper macro named 'PCI_DEVICE_DRIVER_OVERRIDE' to enable setting some data on it. Next patch from this series will use the above functionality. 
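A sketch of an id table entry using the new macro; the IDs here are illustrative, and the literal 1 stands in for the named flag (PCI_ID_F_VFIO_DRIVER_OVERRIDE) that the next patch introduces:

#include <linux/module.h>
#include <linux/pci.h>

static const struct pci_device_id example_vfio_ids[] = {
	/* Matches 15b3:1021 only when driver_override names this driver. */
	{ PCI_DEVICE_DRIVER_OVERRIDE(0x15b3, 0x1021, 1) },
	{ }
};
MODULE_DEVICE_TABLE(pci, example_vfio_ids);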
Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mod_devicetable.h | 2 ++ include/linux/pci.h | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 8e291cfdaf06..2e3ba6d9ece0 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -34,12 +34,14 @@ typedef unsigned long kernel_ulong_t; * Best practice is to use driver_data as an index * into a static list of equivalent device types, * instead of using it as a pointer. + * @override_only: Match only when dev->driver_override is this driver. */ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ kernel_ulong_t driver_data; /* Data private to the driver */ + __u32 override_only; }; diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..0506b1a8c921 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -901,6 +901,21 @@ struct pci_driver { .vendor = (vend), .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +/** + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with + * override_only flags. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * @driver_override: the 32 bit PCI Device override_only + * + * This macro is used to create a struct pci_device_id that matches only a + * driver_override device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID. + */ +#define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ + .subdevice = PCI_ANY_ID, .override_only = (driver_override) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID -- cgit v1.2.3 From cc6711b0bf36de068b10490198d05ac168377989 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:09 +0300 Subject: PCI / VFIO: Add 'override_only' support for VFIO PCI sub system Expose an 'override_only' helper macro (i.e. PCI_DRIVER_OVERRIDE_DEVICE_VFIO) for VFIO PCI sub system and add the required code to prefix its matching entries with "vfio_" in modules.alias file. It allows VFIO device drivers to include match entries in the modules.alias file produced by kbuild that are not used for normal driver autoprobing and module autoloading. Drivers using these match entries can be connected to the PCI device manually, by userspace, using the existing driver_override sysfs. For example the resulting modules.alias may have: alias pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_core alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci In this example mlx5_core and mlx5_vfio_pci match to the same PCI device. The kernel will autoload and autobind to mlx5_core but the kernel and udev mechanisms will ignore mlx5_vfio_pci. 
When userspace wants to change a device to the VFIO subsystem it can implement a generic algorithm: 1) Identify the sysfs path to the device: /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0 2) Get the modalias string from the kernel: $ cat /sys/bus/pci/devices/0000:01:00.0/modalias pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00 3) Prefix it with vfio_: vfio_pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00 4) Search modules.alias for the above string and select the entry that has the fewest *'s: alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci 5) modprobe the matched module name: $ modprobe mlx5_vfio_pci 6) cat the matched module name to driver_override: echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:01:00.0/driver_override 7) unbind device from original module echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind 8) probe PCI drivers (or explicitly bind to mlx5_vfio_pci) echo 0000:01:00.0 > /sys/bus/pci/drivers_probe The algorithm is independent of bus type. In future the other buses with VFIO device drivers, like platform and ACPI, can use this algorithm as well. This patch is the infrastructure to provide the information in the modules.alias to userspace. Convert the only VFIO pci_driver which results in one new line in the modules.alias: alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci Later series introduce additional HW specific VFIO PCI drivers, such as mlx5_vfio_pci. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas # for pci.h Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mod_devicetable.h | 4 ++++ include/linux/pci.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 2e3ba6d9ece0..ae2e75d15b21 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,10 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +enum { + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct pci_device_id - PCI device ID structure * @vendor: Vendor ID to match (or PCI_ANY_ID) diff --git a/include/linux/pci.h b/include/linux/pci.h index 0506b1a8c921..527a1dfd1d06 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -916,6 +916,20 @@ struct pci_driver { .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ .subdevice = PCI_ANY_ID, .override_only = (driver_override) +/** + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO + * "driver_override" PCI device. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * + * This macro is used to create a struct pci_device_id that matches a + * specific device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID and the driver_override will be set to + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. 
+ */ +#define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID -- cgit v1.2.3 From 7fa005caa35ed92563b9e9d88d319b2623763a77 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:12 +0300 Subject: vfio/pci: Introduce vfio_pci_core.ko Now that vfio_pci has been split into two source modules, one focusing on the "struct pci_driver" (vfio_pci.c) and a toolbox library of code (vfio_pci_core.c), complete the split and move them into two different kernel modules. As before vfio_pci.ko continues to present the same interface under sysfs and this change will have no functional impact. Splitting into another module and adding exports allows creating new HW specific VFIO PCI drivers that can implement device specific functionality, such as VFIO migration interfaces or specialized device requirements. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 239 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 include/linux/vfio_pci_core.h (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h new file mode 100644 index 000000000000..ef9a44b6cf5d --- /dev/null +++ b/include/linux/vfio_pci_core.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 
+ * Author: Tom Lyon, pugs@cisco.com + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef VFIO_PCI_CORE_H +#define VFIO_PCI_CORE_H + +#define VFIO_PCI_OFFSET_SHIFT 40 + +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +/* Special capability IDs predefined access */ +#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ +#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ + +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct vfio_pci_ioeventfd { + struct list_head next; + struct vfio_pci_core_device *vdev; + struct virqfd *virqfd; + void __iomem *addr; + uint64_t data; + loff_t pos; + int bar; + int count; + bool test_mem; +}; + +struct vfio_pci_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; + struct irq_bypass_producer producer; +}; + +struct vfio_pci_core_device; +struct vfio_pci_region; + +struct vfio_pci_regops { + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + void (*release)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region); + int (*mmap)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma); + int (*add_capability)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps); +}; + +struct vfio_pci_region { + u32 type; + u32 subtype; + const struct vfio_pci_regops *ops; + void *data; + size_t size; + u32 flags; +}; + +struct vfio_pci_dummy_resource { + struct resource resource; + int index; + struct list_head res_next; +}; + +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + +struct vfio_pci_core_device { + struct vfio_device vdev; + struct pci_dev *pdev; + void __iomem *barmap[PCI_STD_NUM_BARS]; + bool bar_mmap_supported[PCI_STD_NUM_BARS]; + u8 *pci_config_map; + u8 *vconfig; + struct perm_bits *msi_perm; + spinlock_t irqlock; + struct mutex igate; + struct vfio_pci_irq_ctx *ctx; + int num_ctx; + int irq_type; + int num_regions; + struct vfio_pci_region *region; + u8 msi_qmax; + u8 msix_bar; + u16 msix_size; + u32 msix_offset; + u32 rbar[7]; + bool pci_2_3; + bool virq_disabled; + bool reset_works; + bool extended_caps; + bool bardirty; + bool has_vga; + bool needs_reset; + bool nointx; + bool needs_pm_restore; + struct pci_saved_state *pci_saved_state; + struct pci_saved_state *pm_save; + int ioeventfds_nr; + struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; + struct list_head dummy_resources_list; + struct mutex ioeventfds_lock; + struct list_head ioeventfds_list; + struct vfio_pci_vf_token *vf_token; + struct notifier_block nb; + struct mutex vma_lock; + struct list_head vma_list; + struct rw_semaphore memory_lock; +}; + +#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) +#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) +#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) +#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) +#define irq_is(vdev, type) (vdev->irq_type == type) + +extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +extern void 
vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); + +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, + uint32_t flags, unsigned index, + unsigned start, unsigned count, void *data); + +extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + +extern int vfio_pci_init_perm_bits(void); +extern void vfio_pci_uninit_perm_bits(void); + +extern int vfio_config_init(struct vfio_pci_core_device *vdev); +extern void vfio_config_free(struct vfio_pci_core_device *vdev); + +extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); + +extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, + pci_power_t state); + +extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device + *vdev); +extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, + u16 cmd); + +#ifdef CONFIG_VFIO_PCI_IGD +extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) +{ + return -ENODEV; +} +#endif + +#ifdef CONFIG_S390 +extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +#else +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + return -ENODEV; +} +#endif + +/* Will be exported for vfio pci drivers usage */ +void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, + bool is_disable_idle_d3); +void vfio_pci_core_close_device(struct vfio_device *core_vdev); +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); +extern const struct pci_error_handlers vfio_pci_core_err_handlers; +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg); +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos); +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); + +static inline 
bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} + +#endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From b3636a3a2c51715736d3ec45f635ed03191962ce Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 31 Jul 2021 22:50:32 +0300 Subject: PM: runtime: add devm_pm_runtime_enable helper A typical code pattern for pm_runtime_enable() call is to call it in the _probe function and to call pm_runtime_disable() both from _probe error path and from _remove function. For some drivers the whole remove function would consist of the call to pm_runtime_disable(). Add a helper function to replace this boilerplate piece of code. Calling devm_pm_runtime_enable() removes the need for calling pm_runtime_disable() both in the probe()'s error path and in the remove() function. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210731195034.979084-2-dmitry.baryshkov@linaro.org Acked-by: Rafael J. Wysocki Signed-off-by: Stephen Boyd --- include/linux/pm_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index aab8b35e9f8a..222da43b7096 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -59,6 +59,8 @@ extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); +extern int devm_pm_runtime_enable(struct device *dev); + /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. @@ -253,6 +255,8 @@ static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} +static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } + static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} -- cgit v1.2.3 From a649136b17af6361355874a10e9219390c266a2c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 31 Jul 2021 22:50:33 +0300 Subject: PM: runtime: add devm_pm_clk_create helper A typical code pattern for pm_clk_create() call is to call it in the _probe function and to call pm_clk_destroy() both from _probe error path and from _remove function. For some drivers the whole remove function would consist of the call to pm_clk_destroy(). Add a helper function to replace this boilerplate piece of code. Calling devm_pm_clk_create() removes the need for calling pm_clk_destroy() both in the probe()'s error path and in the remove() function. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210731195034.979084-3-dmitry.baryshkov@linaro.org Acked-by: Rafael J.
Wysocki Signed-off-by: Stephen Boyd --- include/linux/pm_clock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 8ddc7860e131..ada3a0ab10bf 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -47,6 +47,7 @@ extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); +extern int devm_pm_clk_create(struct device *dev); #else static inline bool pm_clk_no_clocks(struct device *dev) { @@ -83,6 +84,10 @@ static inline void pm_clk_remove(struct device *dev, const char *con_id) { } static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) { } +static inline int devm_pm_clk_create(struct device *dev) +{ + return -EINVAL; +} #endif #ifdef CONFIG_HAVE_CLK -- cgit v1.2.3 From 8c09e896cef8d908dd9a20a9f2a5c3fcb9799de3 Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Tue, 13 Jul 2021 10:54:34 +0800 Subject: PCI: Allow PASID on fake PCIe devices without TLP prefixes Some systems, e.g., HiSilicon KunPeng920 and KunPeng930, have devices that appear as PCI but are actually on the AMBA bus. Some of these fake PCI devices support a PASID-like feature and they do have a working PASID capability even though they do not use the PCIe Transaction Layer Protocol and do not support TLP prefixes. Add a pasid_no_tlp bit for this "PASID works without TLP prefixes" case and update pci_enable_pasid() so it can enable PASID on these devices. Set this bit for HiSilicon KunPeng920 and KunPeng930. [bhelgaas: squashed, commit log] Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/1626144876-11352-2-git-send-email-zhangfei.gao@linaro.org Link: https://lore.kernel.org/r/1626144876-11352-3-git-send-email-zhangfei.gao@linaro.org Signed-off-by: Zhangfei Gao Signed-off-by: Jean-Philippe Brucker Signed-off-by: Zhou Wang Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..28165dc5b221 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -388,6 +388,7 @@ struct pci_dev { supported from root to here */ u16 l1ss; /* L1SS Capability pointer */ #endif + unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ pci_channel_state_t error_state; /* Current connectivity state */ -- cgit v1.2.3 From 1b7646014e0d838b06be7288e2dec3262948cc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:05 +0200 Subject: dax: mark dax_get_by_host static And move the code around a bit to avoid a forward declaration.
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-5-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b52f084aa643..379739b55408 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,7 +41,6 @@ struct dax_operations { extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags); void put_dax(struct dax_device *dax_dev); @@ -73,10 +72,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else -static inline struct dax_device *dax_get_by_host(const char *host) -{ - return NULL; -} static inline struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags) { -- cgit v1.2.3 From cd93a2a4d1b076f5c73d70d836c202bbcbeea49e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:07 +0200 Subject: dax: remove __generic_fsdax_supported Just implement generic_fsdax_supported directly out of line instead of adding a wrapper. Given that generic_fsdax_supported is only supplied for CONFIG_FS_DAX builds, this also allows us to not provide it at all for !CONFIG_FS_DAX builds. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-7-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 379739b55408..0a3ef9701e03 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -123,16 +123,9 @@ static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) return __bdev_dax_supported(bdev, blocksize); } -bool __generic_fsdax_supported(struct dax_device *dax_dev, +bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); -static inline bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, - sectors); -} static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -154,12 +147,7 @@ static inline bool bdev_dax_supported(struct block_device *bdev, return false; } -static inline bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return false; -} +#define generic_fsdax_supported NULL static inline void fs_put_dax(struct dax_device *dax_dev) { -- cgit v1.2.3 From 60b8340f0d6587d7b51990689fcdae567f309fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:08 +0200 Subject: dax: stub out dax_supported for !CONFIG_FS_DAX dax_supported calls into ->dax_supported which checks for fsdax support. Don't bother building it for !CONFIG_FS_DAX as it will always return false.
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-8-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 0a3ef9701e03..32dce5763f2c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -57,8 +57,6 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev) { __set_dax_synchronous(dax_dev); } -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); /* * Check if given mapping is supported by the file / underlying device. */ @@ -101,12 +99,6 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { @@ -127,6 +119,9 @@ bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); + static inline void fs_put_dax(struct dax_device *dax_dev) { put_dax(dax_dev); @@ -149,6 +144,13 @@ static inline bool bdev_dax_supported(struct block_device *bdev, #define generic_fsdax_supported NULL +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} + static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3 From bdd3c50d83bf7f6acc869b48d02670d19030ae03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:10 +0200 Subject: dax: remove bdev_dax_supported All callers already have a dax_device obtained from fs_dax_get_by_bdev at hand, so just pass that to dax_supported() instead of doing another lookup.
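To make the resulting call-site pattern concrete, a mount-time check would now look roughly like this (a minimal sketch that assumes the caller already holds a dax_device reference from fs_dax_get_by_bdev(); it is not the literal upstream conversion of any particular filesystem):

	/* sketch: probe DAX suitability with the dax_device we already have */
	struct dax_device *dax_dev = fs_dax_get_by_bdev(bdev);

	if (!dax_supported(dax_dev, bdev, blocksize, 0,
			   i_size_read(bdev->bd_inode) / 512)) {
		/* fall back to a non-DAX mount */
	}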
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-10-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 32dce5763f2c..2619d94c308d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -109,12 +109,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) -bool __bdev_dax_supported(struct block_device *bdev, int blocksize); -static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) -{ - return __bdev_dax_supported(bdev, blocksize); -} - bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); @@ -136,12 +130,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); #else -static inline bool bdev_dax_supported(struct block_device *bdev, - int blocksize) -{ - return false; -} - #define generic_fsdax_supported NULL static inline bool dax_supported(struct dax_device *dax_dev, -- cgit v1.2.3 From 2908f5e101e3fb1d478cff1c556966e1af816641 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Feb 2020 10:00:22 +0000 Subject: fscache: Add a cookie debug ID and use that in traces Add a cookie debug ID and use that in traces and in procfiles rather than displaying the (hashed) pointer to the cookie. This is easier to correlate and we don't lose anything when interpreting oops output since that shows unhashed addresses and registers that aren't comparable to the hashed values. Changes: ver #2: - Fix the fscache_op tracepoint to handle a NULL cookie pointer. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/158861210988.340223.11688464116498247790.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465769844.1376105.14119502774019865432.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588459097.3465195.1273313637721852165.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162431193544.2908479.17556704572948300790.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index abc1c4737fb8..ba58c427cf9a 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -126,6 +126,7 @@ struct fscache_cookie { atomic_t usage; /* number of users of this cookie */ atomic_t n_children; /* number of children of this cookie */ atomic_t n_active; /* number of active users of netfs ptrs */ + unsigned int debug_id; spinlock_t lock; spinlock_t stores_lock; /* lock on page store tree */ struct hlist_head backing_objects; /* object(s) backing this file/index */ -- cgit v1.2.3 From 884a76881fc5f5c9c04de1b640bed2c340929842 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Feb 2020 10:00:22 +0000 Subject: fscache: Procfile to display cookies Add /proc/fs/fscache/cookies to display active cookies. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/158861211871.340223.7223853943667440807.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465771021.1376105.6933857529128238020.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588460994.3465195.16963417803501149328.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162431194785.2908479.786917990782538164.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ba58c427cf9a..ea61e54a6bc5 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -133,6 +133,7 @@ struct fscache_cookie { const struct fscache_cookie_def *def; /* definition */ struct fscache_cookie *parent; /* parent of this entry */ struct hlist_bl_node hash_link; /* Link in hash table */ + struct list_head proc_link; /* Link in proc list */ void *netfs_data; /* back pointer to netfs */ struct radix_tree_root stores; /* pages to be stored on this cookie */ #define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ -- cgit v1.2.3 From 6ae9bd8bb037b7c422bafde746f2338a716f6058 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 09:40:19 +0100 Subject: fscache, cachefiles: Remove the histogram stuff Remove the histogram stuff as it's mostly going to be outdated. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431195953.2908479.16770977195634296638.stgit@warthog.procyon.org.uk/ --- include/linux/fscache-cache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 3235ddbdcc09..fbff0b7e3ef1 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -147,7 +147,6 @@ struct fscache_retrieval { fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ void *context; /* netfs read context (pinned) */ struct list_head to_do; /* list of things to be done by the backend */ - unsigned long start_time; /* time at which retrieval started */ atomic_t n_pages; /* number of pages to be retrieved */ }; -- cgit v1.2.3 From 58f386a73f16cea1f78e8466cc5c402eb7f6fcf8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 09:59:17 +0100 Subject: fscache: Remove the object list procfile Remove the object list procfile from fscache as objects will become entirely internal to the cache. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431198332.2908479.5847286163455099669.stgit@warthog.procyon.org.uk/ --- include/linux/fscache-cache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index fbff0b7e3ef1..8d39491c5f9f 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -384,9 +384,6 @@ struct fscache_object { struct list_head dependents; /* FIFO of dependent objects */ struct list_head dep_link; /* link in parent's dependents list */ struct list_head pending_ops; /* unstarted operations on this object */ -#ifdef CONFIG_FSCACHE_OBJECT_LIST - struct rb_node objlist_link; /* link in global object list */ -#endif pgoff_t store_limit; /* current storage limit */ loff_t store_limit_l; /* current storage limit */ }; -- cgit v1.2.3 From 20ec197bfa13c5b799fc9527790ea7b5374fc8f2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Mar 2021 13:53:50 +0100 Subject: fscache: Use refcount_t for the cookie refcount instead of atomic_t Use refcount_t for the fscache_cookie refcount instead of atomic_t and rename the 'usage' member to 'ref' in such cases. The tracepoints that reference it change from showing "u=%d" to "r=%d". Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431204358.2908479.8006938388213098079.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ea61e54a6bc5..a4dab5998613 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -123,7 +123,7 @@ struct fscache_cookie { - atomic_t usage; /* number of users of this cookie */ + refcount_t ref; /* number of users of this cookie */ atomic_t n_children; /* number of children of this cookie */ atomic_t n_active; /* number of active users of netfs ptrs */ unsigned int debug_id; -- cgit v1.2.3 From 3a3f976639f267823e443fdd8bffa03848fa1c3f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:14 -0400 Subject: SUNRPC keep track of number of transports to unique addresses Currently, xprt_switch keeps a count of all xprts (xps_nxprts) that were added to the switch, regardless of whether it's an nconnect transport or a transport to a trunkable address. Introduce a new counter to keep track of transports to unique destination addresses per xprt_switch.
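To illustrate the accounting this implies, adding a transport to the switch would bump the new counter only when its destination address has not been seen before; a minimal sketch (the lookup helper and the placement of the check are assumptions, not the literal upstream code):

	/* sketch: count only transports to a previously unseen address */
	if (!rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
		xps->xps_nunique_destaddr_xprts++;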
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtmultipath.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index b19addc8b715..bbb8a5fa0816 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -18,6 +18,7 @@ struct rpc_xprt_switch { unsigned int xps_id; unsigned int xps_nxprts; unsigned int xps_nactive; + unsigned int xps_nunique_destaddr_xprts; atomic_long_t xps_queuelen; struct list_head xps_xprt_list; -- cgit v1.2.3 From 7e134205f62955369619021a695cd78fefd32451 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:17 -0400 Subject: NFSv4 introduce max_connect mount options This option controls how many xprts to distinct server addresses the client can establish (nconnect connections are not counted towards this new limit). This patch sets up the NFS structures to keep track of the max_connect limit (it does not enforce it). The default value is kept at 1 so that current mounts that don't want any additional connections are not affected. The maximum value is set at 16. Mounts to a DS are not limited to the default value of 1 but are instead set to the maximum of 16 (NFS_MAX_TRANSPORTS). Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 5 +++++ include/linux/nfs_fs_sb.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ce6474594872..b9a8b925db43 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -40,6 +40,11 @@ #include +/* + * These are the default for number of transports to different server IPs + */ +#define NFS_MAX_TRANSPORTS 16 + /* * These are the default flags for swap requests */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d71a0e90faeb..2a9acbfe00f0 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -62,6 +62,7 @@ struct nfs_client { u32 cl_minorversion;/* NFSv4 minorversion */ unsigned int cl_nconnect; /* Number of connections */ + unsigned int cl_max_connect; /* max number of xprts allowed */ const char * cl_principal; /* used for machine cred */ #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.3 From dc48e0abee245e2f0361bd8d4e3b00f70450fab2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:18 -0400 Subject: SUNRPC enforce creation of no more than max_connect xprts If we are adding new transports via rpc_clnt_test_and_add_xprt() then check if we've reached the limit. Currently only the pNFS path adds transports via that function, but this is done in preparation for the client adding new transports when session trunking is detected. A warning is logged if the limit is reached.
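A minimal sketch of such a check inside rpc_clnt_test_and_add_xprt(), reusing the counter and the cl_max_connect field introduced above (the exact upstream control flow and message wording may differ):

	/* sketch: refuse another transport once the per-client ceiling is hit */
	if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) {
		pr_warn("SUNRPC: reached max allowed number (%d) of transports to server %s\n",
			clnt->cl_max_connect,
			rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
		return -EINVAL;
	}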
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b2edd5fc2f0c..a4661646adc9 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -82,6 +82,7 @@ struct rpc_clnt { struct work_struct cl_work; }; const struct cred *cl_cred; + unsigned int cl_max_connect; /* max number of transports not to the same IP */ }; /* @@ -136,6 +137,7 @@ struct rpc_create_args { char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ const struct cred *cred; + unsigned int max_connect; }; struct rpc_add_xprt_test { -- cgit v1.2.3 From ab959c7d4ea086852f35c7ff20ecd79b7471cfad Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 6 Aug 2021 10:53:21 +0100 Subject: dmaengine: Extend the dma_slave_width for 128 bytes Add DMA_SLAVE_BUSWIDTH_128_BYTES to enum dma_slave_buswidth so DMA engines and users can select 128 bytes as the bus width. Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20210806095322.2326-3-biju.das.jz@bp.renesas.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 93c3ca5fdafd..e5c2c9e71bf1 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -380,6 +380,7 @@ enum dma_slave_buswidth { DMA_SLAVE_BUSWIDTH_16_BYTES = 16, DMA_SLAVE_BUSWIDTH_32_BYTES = 32, DMA_SLAVE_BUSWIDTH_64_BYTES = 64, + DMA_SLAVE_BUSWIDTH_128_BYTES = 128, }; /** @@ -398,7 +399,7 @@ enum dma_slave_buswidth { * @src_addr_width: this is the width in bytes of the source (RX) * register where DMA data shall be read. If the source * is memory this may be ignored depending on architecture. - * Legal values: 1, 2, 3, 4, 8, 16, 32, 64. + * Legal values: 1, 2, 3, 4, 8, 16, 32, 64, 128. * @dst_addr_width: same as src_addr_width but for destination * target (TX) mutatis mutandis. * @src_maxburst: the maximum number of words (note: words, as in -- cgit v1.2.3 From d7e7747ac5c2496c98291944c6066adaa9f3b975 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:22 +0200 Subject: netfilter: refuse insertion if chain has grown too large Also add a stat counter for this that gets exported both via the old /proc interface and ctnetlink. Assuming the old default size of 16536 buckets and max hash occupancy of 64k, this results in 128k insertions (origin+reply), so ~8 entries per chain on average. The revised settings in this series will result in about two entries per bucket on average. This allows a hard-limit ceiling of 64. This is not tunable at the moment, but it's possible to either increase nf_conntrack_buckets or decrease nf_conntrack_max to reduce average lengths.
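As an illustration of the insertion-path check being described, the ceiling can be enforced while walking a hash bucket, roughly as follows (a sketch: the constant name, the walk details and the error handling are assumptions, not the literal upstream code):

	#define MAX_CHAINLEN	64u

	unsigned int chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (chainlen++ > MAX_CHAINLEN)
			goto chaintoolong;
		/* ... existing clash/duplicate checks ... */
	}
	return 0;

chaintoolong:
	/* bump the new stat counter and refuse the insertion */
	NF_CT_STAT_INC(net, chaintoolong);
	return -ENOSPC;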
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 0c7d8d1e945d..700ea077ce2d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -18,6 +18,7 @@ struct ip_conntrack_stat { unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; + unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL -- cgit v1.2.3 From a61590892ef097c180144fa469abe2256b9ae715 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:53:42 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_tag() Now that the last users have been migrated to pci_vpd_find_ro_info_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/71131eca-0502-7878-365f-30b6614161cf@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 827b7eefd550..4fb233e374c5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,17 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_tag - Locates the Resource Data Type tag provided - * @buf: Pointer to buffered vpd data - * @len: The length of the vpd buffer - * @rdt: The Resource Data Type to search for - * - * Returns the index where the Resource Data Type was found or - * -ENOENT otherwise. - */ -int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); - /** * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD * @buf: Pointer to buffered vpd data -- cgit v1.2.3 From 59b83b29bb5532bbff54a271e0b4f321e28b954f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:54:23 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_info_keyword() Now that the last users have been migrated to pci_vpd_find_ro_info_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/96ca2a56-383e-9b61-9cba-4f1e5611dc15@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4fb233e374c5..196cbf4c76a1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,19 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD - * @buf: Pointer to buffered vpd data - * @off: The offset into the buffer at which to begin the search - * @len: The length of the buffer area, relative to off, in which to search - * @kw: The keyword to search for - * - * Returns the index where the information field keyword was found or - * -ENOENT otherwise.
- */ -int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, - unsigned int len, const char *kw); - /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3 From acfbb1b8a494d7bfd316dfb363a820e6df637e8d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:55:43 +0200 Subject: PCI/VPD: Add pci_vpd_find_id_string() Add a pci_vpd_find_id_string() API function to retrieve the ID string from VPD. This way callers don't need pci_vpd_lrdt_size() any longer, and it can be made private to the VPD core. Link: https://lore.kernel.org/r/c5225bf6-8d29-970d-e271-0d7b52252630@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 196cbf4c76a1..ea330ca0501a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,6 +2339,16 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); +/** + * pci_vpd_find_id_string - Locate id string in VPD + * @buf: Pointer to buffered VPD data + * @len: The length of the buffer area in which to search + * @size: Pointer to field where length of id string is returned + * + * Returns the index of the id string or -ENOENT if not found. + */ +int pci_vpd_find_id_string(const u8 *buf, unsigned int len, unsigned int *size); + /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3 From 06e1913d457121a98ee276179734c34dab30f388 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:57:01 +0200 Subject: PCI/VPD: Clean up public VPD defines and inline functions After the recent introduction of new VPD API functions and the migration of their users, these defines and inline functions are no longer used outside the VPD core. Link: https://lore.kernel.org/r/d33e06bf-bc5e-ece7-bf35-7245ae224d1b@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 69 ----------------------------------------------------- 1 file changed, 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index ea330ca0501a..303034d03c33 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2255,81 +2255,12 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask); #define PCI_VPD_LRDT_RO_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RO_DATA) #define PCI_VPD_LRDT_RW_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RW_DATA) -/* Small Resource Data Type Tag Item Names */ -#define PCI_VPD_STIN_END 0x0f /* End */ - -#define PCI_VPD_SRDT_END (PCI_VPD_STIN_END << 3) - -#define PCI_VPD_SRDT_TIN_MASK 0x78 -#define PCI_VPD_SRDT_LEN_MASK 0x07 -#define PCI_VPD_LRDT_TIN_MASK 0x7f - -#define PCI_VPD_LRDT_TAG_SIZE 3 -#define PCI_VPD_SRDT_TAG_SIZE 1 - -#define PCI_VPD_INFO_FLD_HDR_SIZE 3 - #define PCI_VPD_RO_KEYWORD_PARTNO "PN" #define PCI_VPD_RO_KEYWORD_SERIALNO "SN" #define PCI_VPD_RO_KEYWORD_MFR_ID "MN" #define PCI_VPD_RO_KEYWORD_VENDOR0 "V0" #define PCI_VPD_RO_KEYWORD_CHKSUM "RV" -/** - * pci_vpd_lrdt_size - Extracts the Large Resource Data Type length - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type length.
- */ -static inline u16 pci_vpd_lrdt_size(const u8 *lrdt) -{ - return (u16)lrdt[1] + ((u16)lrdt[2] << 8); -} - -/** - * pci_vpd_lrdt_tag - Extracts the Large Resource Data Type Tag Item - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type Tag item. - */ -static inline u16 pci_vpd_lrdt_tag(const u8 *lrdt) -{ - return (u16)(lrdt[0] & PCI_VPD_LRDT_TIN_MASK); -} - -/** - * pci_vpd_srdt_size - Extracts the Small Resource Data Type length - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type length. - */ -static inline u8 pci_vpd_srdt_size(const u8 *srdt) -{ - return (*srdt) & PCI_VPD_SRDT_LEN_MASK; -} - -/** - * pci_vpd_srdt_tag - Extracts the Small Resource Data Type Tag Item - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type Tag Item. - */ -static inline u8 pci_vpd_srdt_tag(const u8 *srdt) -{ - return ((*srdt) & PCI_VPD_SRDT_TIN_MASK) >> 3; -} - -/** - * pci_vpd_info_field_size - Extracts the information field length - * @info_field: Pointer to the beginning of an information field header - * - * Returns the extracted information field length. - */ -static inline u8 pci_vpd_info_field_size(const u8 *info_field) -{ - return info_field[2]; -} - /** * pci_vpd_alloc - Allocate buffer and read VPD into it * @dev: PCI device -- cgit v1.2.3 From 4b92d4add5f6dcf21275185c997d6ecb800054cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Aug 2021 13:48:34 +0200 Subject: drivers: base: cacheinfo: Get rid of DEFINE_SMP_CALL_CACHE_FUNCTION() DEFINE_SMP_CALL_CACHE_FUNCTION() was useful before the CPU hotplug rework to ensure that the cache-related functions are called on the upcoming CPU, because the notifier itself could run on any online CPU. The hotplug state machine guarantees that the callbacks are invoked on the upcoming CPU. So there is no need to have this SMP function call obfuscation. That indirection was missed when the hotplug notifiers were converted. This also solves the problem of ARM64 init_cache_level() invoking ACPI functions which take a semaphore in that context. That's invalid as SMP function calls run with interrupts disabled. Running it just from the callback in the context of the CPU hotplug thread solves this.
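For context, the conversion amounts to replacing the generated SMP-call wrapper with a plain implementation of the hook (a sketch; the arch-specific body is elided):

	/* before: the macro generated init_cache_level() as an
	 * smp_call_function_single() wrapper around __init_cache_level() */
	DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)

	/* after: the hook is implemented directly; the hotplug state
	 * machine already invokes it on the upcoming CPU with interrupts
	 * enabled, so it may sleep or take semaphores (e.g. for ACPI) */
	int init_cache_level(unsigned int cpu)
	{
		/* ... former __init_cache_level() body ... */
		return 0;
	}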
Fixes: 8571890e1513 ("arm64: Add support for ACPI based firmware tables") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Acked-by: Will Deacon Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/871r69ersb.ffs@tglx --- include/linux/cacheinfo.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 4f72b47973c3..2f909ed084c6 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -79,24 +79,6 @@ struct cpu_cacheinfo { bool cpu_map_populated; }; -/* - * Helpers to make sure "func" is executed on the cpu whose cache - * attributes are being detected - */ -#define DEFINE_SMP_CALL_CACHE_FUNCTION(func) \ -static inline void _##func(void *ret) \ -{ \ - int cpu = smp_processor_id(); \ - *(int *)ret = __##func(cpu); \ } \ - \ -int func(unsigned int cpu) \ -{ \ - int ret; \ - smp_call_function_single(cpu, _##func, &ret, true); \ - return ret; \ -} - struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu); int init_cache_level(unsigned int cpu); int populate_cache_leaves(unsigned int cpu); -- cgit v1.2.3 From ef6c8da71eaffe4e251b0ff2a1d0da96f89fe6b0 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Wed, 1 Sep 2021 15:25:50 +0530 Subject: octeontx2-pf: cn10K: Reserve LMTST lines per core This patch reserves LMTST lines per CPU instead of separate LMTST lines for NPA (buffer free) and NIX (SQE flush). The LMTST line of the core on which an SQ or RQ is processed is used for the LMTST operation. This patch also replaces STEOR with STEORL, which has release semantics, and updates the driver name in the ethtool file. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- include/linux/soc/marvell/octeontx2/asm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index 28c04d918f0f..fa1d6af0164e 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -22,12 +22,17 @@ : [rs]"r" (ioaddr)); \ (result); \ }) +/* + * STEORL store to memory with release semantics. + * This will avoid using DMB barrier after each LMTST + * operation. + */ #define cn10k_lmt_flush(val, addr) \ ({ \ __asm__ volatile(".cpu generic+lse\n" \ - "steor %x[rf],[%[rs]]" \ - : [rf]"+r"(val) \ - : [rs]"r"(addr)); \ + "steorl %x[rf],[%[rs]]" \ + : [rf] "+r"(val) \ + : [rs] "r"(addr)); \ }) #else #define otx2_lmt_flush(ioaddr) ({ 0; }) -- cgit v1.2.3 From 59dc33252ee777e02332774fbdf3381b1d5d5f5d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 Aug 2021 16:43:55 +0200 Subject: PCI: VMD: ACPI: Make ACPI companion lookup work for VMD bus On some systems, in order to get to the deepest low-power state of the platform (which may be necessary to save significant enough amounts of energy while suspended to idle, for example), devices on the PCI bus exposed by the VMD driver need to be power-managed via ACPI. However, the layout of the ACPI namespace below the VMD controller device object does not reflect the layout of the PCI bus under the VMD host bridge, so in order to identify the ACPI companion objects for the devices on that bus, it is necessary to use a special _ADR encoding on the ACPI side. In other words, acpi_pci_find_companion() does not work for these devices, so it needs to be amended with special lookup logic specific to the VMD bus.
Address this issue by allowing the VMD driver to temporarily install an ACPI companion lookup hook containing the code matching the devices on the VMD PCI bus with the corresponding objects in the ACPI namespace. Signed-off-by: Rafael J. Wysocki Acked-by: Jon Derrick --- include/linux/pci-acpi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 5ba475ca9078..f16de399d2de 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -122,6 +122,9 @@ static inline void pci_acpi_add_edr_notifier(struct pci_dev *pdev) { } static inline void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) { } #endif /* CONFIG_PCIE_EDR */ +int pci_acpi_set_companion_lookup_hook(struct acpi_device *(*func)(struct pci_dev *)); +void pci_acpi_clear_companion_lookup_hook(void); + #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } -- cgit v1.2.3 From 4bf8e582119ed9767f907abb6dc62ef9dddf10df Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 1 Sep 2021 14:41:57 +0530 Subject: cpufreq: Remove ready() callback This isn't used anymore, get rid of it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..fe6acc04e5e5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -367,9 +367,6 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); - /* Will be called after the driver is fully initialized */ - void (*ready)(struct cpufreq_policy *policy); - struct freq_attr **attr; /* platform specific boost support code */ -- cgit v1.2.3 From 8083f58d08fd52f547c0a62c0f4e448e15e6726b Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 7 Jul 2021 18:28:35 +0200 Subject: pwm: Make pwmchip_remove() return void For some time now, pwmchip_remove() has always returned 0, so the return value isn't useful. Now that all callers are converted to ignore its value the function can be changed to return void. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a0b7e43049d5..725c9b784e60 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -404,7 +404,7 @@ int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); int pwmchip_add(struct pwm_chip *chip); -int pwmchip_remove(struct pwm_chip *chip); +void pwmchip_remove(struct pwm_chip *chip); int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip); -- cgit v1.2.3 From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT Commit 730633f0b7f95 became the first direct caller of __init_rwsem(), as opposed to the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it.
[ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- include/linux/rwsem.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 426e98e0b675..352c6127cb90 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -142,22 +142,14 @@ struct rw_semaphore { #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, +extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); -#else -static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -} -#endif #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - init_rwbase_rt(&(sem)->rwbase); \ - __rwsem_init((sem), #sem, &__key); \ + __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) -- cgit v1.2.3 From d095559ce4100f0c02aea229705230deac329c97 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 5 Jul 2021 09:22:56 +0800 Subject: ceph: flush mdlog before umounting Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index e41a811026f6..bc2699feddbe 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_SESSION_FLUSHMSG_ACK, CEPH_SESSION_FORCE_RO, CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3 From 7a8526a5cd51cf5f070310c6c37dd7293334ac49 Mon Sep 17 00:00:00 2001 From: Kate Hsuan Date: Fri, 3 Sep 2021 17:44:11 +0800 Subject: libata: Add ATA_HORKAGE_NO_NCQ_ON_ATI for Samsung 860 and 870 SSD. Many users report that the Samsung 860 and 870 SSDs have various issues when combined with AMD/ATI (vendor ID 0x1002) SATA controllers, and that only completely disabling NCQ helps to avoid these issues. Always disabling NCQ for Samsung 860/870 SSDs regardless of the host SATA adapter vendor will cause I/O performance degradation with well-behaved adapters. To limit the performance impact to ATI adapters, introduce the ATA_HORKAGE_NO_NCQ_ON_ATI flag to force disable NCQ only for these adapters. Also, two libata.force parameters (noncqati and ncqati) are introduced to disable and enable NCQ on systems equipped with an ATI SATA adapter and Samsung 860 or 870 SSDs. This lets the user enable or disable NCQ on demand. After verifying the chipset from the user reports, the issue appears on AMD/ATI SB7x0/SB8x0/SB9x0 SATA Controllers and does not appear on recent AMD SATA adapters. The vendor ID of ATI should be 0x1002. Therefore, ATA_HORKAGE_NO_NCQ_ON_AMD was modified to ATA_HORKAGE_NO_NCQ_ON_ATI. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=201693 Signed-off-by: Kate Hsuan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20210903094411.58749-1-hpa@redhat.com Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 860e63f5667b..c0c64f03e107 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -426,6 +426,7 @@ enum { ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ + ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..48c6fa9ab792 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2581,6 +2581,7 @@ extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -- cgit v1.2.3 From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file We want to remove VM_DENYWRITE, which is currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keep write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W.
Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48c6fa9ab792..56b1cd41db61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2580,7 +2580,7 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -- cgit v1.2.3 From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 - include/linux/mman.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56b1cd41db61..257995f62e83 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,7 +281,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/linux/mman.h b/include/linux/mman.h index ebb09a964272..bd9aadda047b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -153,7 +153,6 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | - _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | arch_calc_vm_flag_bits(flags); -- cgit v1.2.3 From 6128b3af2a5e42386aa7faf37609b57f39fb7d00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:38:14 +0200 Subject: mm: ignore MAP_DENYWRITE in ksys_mmap_pgoff() Let's also remove masking off MAP_DENYWRITE from ksys_mmap_pgoff(): the last in-tree occurrence of MAP_DENYWRITE is now in LEGACY_MAP_MASK, which accepts the flag e.g., for MAP_SHARED_VALIDATE; however, the flag is ignored throughout the kernel now. Add a comment to LEGACY_MAP_MASK stating that MAP_DENYWRITE is ignored. Acked-by: "Eric W.
Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mman.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index bd9aadda047b..b66e91b8176c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -32,7 +32,8 @@ * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * - * MAP_EXECUTABLE is completely ignored throughout the kernel. + * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the + * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ -- cgit v1.2.3 From 592ca09be8333bd226f50100328a905bfc377133 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:45:45 +0200 Subject: fs: update documentation of get_write_access() and friends As VM_DENYWRITE no longer exists, let's spring-clean the documentation of get_write_access() and friends. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/fs.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..e0dc3e96ed72 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3055,15 +3055,20 @@ static inline void file_end_write(struct file *file) } /* + * This is used for regular files where some users -- especially the + * currently executed binary in a process, previously handled via + * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap + * read-write shared) accesses. + * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. - * This is used for regular files. - * We cannot support write (and maybe mmap read-write shared) accesses and - * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode - * can have the following values: - * 0: no writers, no VM_DENYWRITE mappings - * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist - * > 0: (i_writecount) users are writing to the file. + * deny_write_access() denies write access to a file. + * allow_write_access() re-enables write access to a file. + * + * The i_writecount field of an inode can have the following values: + * 0: no write access, no denied write access + * < 0: (-i_writecount) users that denied write access to the file. + * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet.
Then we need to -- cgit v1.2.3 From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3..8894825cc4db 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif -- cgit v1.2.3 From 633a2abb9e1cd5c95f3b600f4b2c12cce22ae4a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:04 -0700 Subject: writeback: track number of inodes under writeback Patch series "writeback: Fix bandwidth estimates", v4. Fix estimate of writeback throughput when device is not fully busy doing writeback. Michael Stapelberg has reported that such workload (e.g. generated by linking) tends to push estimated throughput down to 0 and as a result writeback on the device is practically stalled. The first three patches fix the reported issue, the remaining two patches are unrelated cleanups of problems I've noticed when reading the code. This patch (of 4): Track number of inodes under writeback for each bdi_writeback structure. We will use this to decide whether wb does any IO and so we can estimate its writeback throughput. In principle we could use number of pages under writeback (WB_WRITEBACK counter) for this however normal percpu counter reads are too inaccurate for our purposes and summing the counter is too expensive. 
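For illustration, the new counter would be maintained from the points where an inode's writeback state changes; a minimal sketch (the helper names follow the description above and are illustrative, not necessarily the final API):

  static void wb_inode_writeback_start(struct bdi_writeback *wb)
  {
  	atomic_inc(&wb->writeback_inodes);
  }

  static void wb_inode_writeback_end(struct bdi_writeback *wb)
  {
  	atomic_dec(&wb->writeback_inodes);
  }

The start helper would run when the first page of an inode is marked under writeback, and the end helper when its last page completes, so a non-zero counter means the wb is currently doing IO.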
Link: https://lkml.kernel.org/r/20210713104519.16394-1-jack@suse.cz Link: https://lkml.kernel.org/r/20210713104716.22868-1-jack@suse.cz Signed-off-by: Jan Kara Cc: Wu Fengguang Cc: Michael Stapelberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1d7edad9914f..06fb8e13f6bc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,7 @@ struct bdi_writeback { struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long congested; /* WB_[a]sync_congested flags */ -- cgit v1.2.3 From fee468fdf41cdf36ba6b5a780e2474d0a3e066ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:06 -0700 Subject: writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 19 +++++++++++++++++++ include/linux/writeback.h | 1 - 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..8a886bca51e5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) return inode->i_wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? 
wbc->wb : &inode_to_bdi(inode)->wb; +} + /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode @@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + return inode_to_wb(inode); +} + + static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..2480322c06a7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From 45a2966fd64147518dc5bca25f447bd0fb5359ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:09 -0700 Subject: writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. 
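One way to get that extra update, sketched under the assumption that the writeback completion path can see the bandwidth timestamp: when writeback of an inode finishes and the estimate is older than the usual 200ms interval, kick a delayed work that recomputes it once the in-flight pages have drained:

  if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
  	queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);

That way a single short spike still produces one estimate update based on everything it wrote, instead of the next spike discarding it as idle time.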
[jack@suse.cz: avoid division by 0 in wb_update_dirty_ratelimit()] Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc..33207004cfde 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a7..cbaef099645e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. 
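A sketch of the simplification (the exact stat item read is an assumption here; the point is reading the counter directly instead of calling mem_cgroup_wb_stats()):

  /* best-effort: just the dirty page count, no bandwidth/headroom math */
  nr_pages = memcg_page_state(memcg, NR_FILE_DIRTY);

memcg_page_state() is made available to this caller by the header change below.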
Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1..3403ec77528a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645e..aeda2c0c9986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); -- cgit v1.2.3 From 6de522d1667f628376517e4b177af10c739d745b Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Thu, 2 Sep 2021 14:53:30 -0700 Subject: include/linux/buffer_head.h: fix boolreturn.cocci warnings ./include/linux/buffer_head.h:412:64-65:WARNING:return of 0/1 in function 'has_bh_in_lru' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. 
Generated by: scripts/coccinelle/misc/boolreturn.cocci Link: https://lkml.kernel.org/r/20210824055828.58783-1-deng.changcheng@zte.com.cn Signed-off-by: Jing Yangyang Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e7e99da31349..6486d3c19463 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus_cpu(int cpu) {} -static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } +static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:51 -0700 Subject: mm/gup: small refactoring: simplify try_grab_page() try_grab_page() does the same thing as try_grab_compound_head(..., refs=1, ...), just with a different API. So there is a lot of code duplication there. Change try_grab_page() to call try_grab_compound_head(), while keeping the API contract identical for callers. Also, now that try_grab_compound_head() always has a caller, remove the __maybe_unused annotation. Link: https://lkml.kernel.org/r/20210813044133.1536842-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..af4845e81b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1214,8 +1214,8 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); +struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); static inline __must_check bool try_get_page(struct page *page) -- cgit v1.2.3 From 9857a17f206ff374aea78bccfb687f145368be2e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:54 -0700 Subject: mm/gup: remove try_get_page(), call try_get_compound_head() directly try_get_page() is very similar to try_get_compound_head(), and in fact try_get_page() has fallen a little behind in terms of maintenance: try_get_compound_head() handles speculative page references more thoroughly. There are only two try_get_page() callsites, so just call try_get_compound_head() directly from those, and remove try_get_page() entirely. Also, seeing as how this changes try_get_compound_head() into a non-static function, provide some kerneldoc documentation for it. 
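A caller conversion then looks roughly like this (sketch):

  /* before: bool-returning helper */
  if (unlikely(!try_get_page(page)))
  	return NULL;

  /* after: refs == 1 covers the former try_get_page() case */
  if (unlikely(!try_get_compound_head(page, 1)))
  	return NULL;

The difference callers must absorb is the return type: try_get_compound_head() hands back the head page (or NULL on failure) rather than a bool.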
Link: https://lkml.kernel.org/r/20210813044133.1536842-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index af4845e81b84..d3439dd4f4ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1217,15 +1217,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); - -static inline __must_check bool try_get_page(struct page *page) -{ - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; -} +struct page *try_get_compound_head(struct page *page, int refs); static inline void put_page(struct page *page) { -- cgit v1.2.3 From 3969b1a654fb09b7915efc1aa4ad45daf932e12f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:54:00 -0700 Subject: mm: delete unused get_kernel_page() get_kernel_page() was added in 2012 by [1]. It was used for a while for NFS, but then in 2014, a refactoring [2] removed all callers, and it has apparently not been used since. Remove get_kernel_page() because it has no callers. [1] commit 18022c5d8627 ("mm: add get_kernel_page[s] for pinning of kernel addresses for I/O") [2] commit 91f79c43d1b5 ("new helper: iov_iter_get_pages_alloc()") Link: https://lkml.kernel.org/r/20210729221847.1165665-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Rik van Riel Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3439dd4f4ba..35bbac32b6f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1839,7 +1839,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); -- cgit v1.2.3 From bf11b9a8e9a93c1fc0ebfc2929622d5cf7d43888 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 2 Sep 2021 14:54:03 -0700 Subject: shmem: use raw_spinlock_t for ->stat_lock Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is per-CPU. Access here is serialized by disabling preemption. If the pool is empty, it gets reloaded from `->next_ino'. Access here is serialized by ->stat_lock which is a spinlock_t and can not be acquired with disabled preemption. One way around it would make per-CPU ino_batch struct containing the inode number a local_lock_t. Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical sections are short. The mpol_put() must be moved outside of the critical section to avoid invoking the destructor with disabled preemption. 
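The resulting shape of the update side is roughly (a sketch of the remount path, assuming the shmem field names):

  struct mempolicy *mpol = NULL;

  raw_spin_lock(&sbinfo->stat_lock);
  /* ... update sbinfo under the raw lock ... */
  mpol = sbinfo->mpol;
  sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
  ctx->mpol = NULL;
  raw_spin_unlock(&sbinfo->stat_lock);

  mpol_put(mpol);			/* destructor runs preemptible */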
Link: https://lkml.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 8e775ce517bb..0a8499fb9c3c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ -- cgit v1.2.3 From d144bf6205342a4b5fed5d204ae18849a4de741b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:21 -0700 Subject: huge tmpfs: fix split_huge_page() after FALLOC_FL_KEEP_SIZE A successful shmem_fallocate() guarantees that the extent has been reserved, even beyond i_size when the FALLOC_FL_KEEP_SIZE flag was used. But that guarantee is broken by shmem_unused_huge_shrink()'s attempts to split huge pages and free their excess beyond i_size; and by other uses of split_huge_page() near i_size. It's sad to add a shmem inode field just for this, but I did not find a better way to keep the guarantee. A flag to say KEEP_SIZE has been used would be cheaper, but I'm averse to unclearable flags. The fallocend field is not perfect either (many disjoint ranges might be fallocated), but good enough; and gains another use later on. Link: https://lkml.kernel.org/r/ca9a146-3a59-6cd3-7f28-e9a044bb1052@google.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0a8499fb9c3c..bfc5899d18e0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -18,6 +18,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ + pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ @@ -119,6 +120,18 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* + * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages + * beyond i_size's notion of EOF, which fallocate has committed to reserving: + * which split_huge_page() must therefore not delete. This use of a single + * "fallocend" per inode errs on the side of not deleting a reservation when + * in doubt: there are plenty of cases when it preserves unreserved pages. 
+ */ +static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) +{ + return max(eof, SHMEM_I(inode)->fallocend); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -- cgit v1.2.3 From acdd9f8e0fed9f1bd7e83a8ff934694bb4c9a72b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:34 -0700 Subject: huge tmpfs: SGP_NOALLOC to stop collapse_file() on race khugepaged's collapse_file() currently uses SGP_NOHUGE to tell shmem_getpage() not to try allocating a huge page, in the very unlikely event that a racing hole-punch removes the swapped or fallocated page as soon as i_pages lock is dropped. We want to consolidate shmem's huge decisions, removing SGP_HUGE and SGP_NOHUGE; but cannot quite persuade ourselves that it's okay to regress the protection in this case - Yang Shi points out that the huge page would remain indefinitely, charged to root instead of the intended memcg. collapse_file() should not even allocate a small page in this case: why proceed if someone is punching a hole? SGP_READ is almost the right flag here, except that it optimizes away from a fallocated page, with NULL to tell caller to fill with zeroes (like a hole); whereas collapse_file()'s sequence relies on using a cache page. Add SGP_NOALLOC just for this. There are too many consecutive "if (page"s there in shmem_getpage_gfp(): group it better; and fix the outdated "bring it back from swap" comment. Link: https://lkml.kernel.org/r/1355343b-acf-4653-ef79-6aee40214ac5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bfc5899d18e0..a3f4502ec8a9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,6 +94,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ -- cgit v1.2.3 From 5e6e5a12a44ca5ff2b130d8d39aaf9b8c026de94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:37 -0700 Subject: huge tmpfs: shmem_is_huge(vma, inode, index) Extend shmem_huge_enabled(vma) to shmem_is_huge(vma, inode, index), so that a consistent set of checks can be applied, even when the inode is accessed through read/write syscalls (with NULL vma) instead of mmaps (the index argument is seldom of interest, but required by mount option "huge=within_size"). Clean up and rearrange the checks a little. This then replaces the checks which shmem_fault() and shmem_getpage_gfp() were making, and eliminates the SGP_HUGE and SGP_NOHUGE modes. Replace a couple of 0s by explicit SHMEM_HUGE_NEVERs; and replace the obscure !shmem_mapping() symlink check by explicit S_ISLNK() - nothing else needs that symlink check, so leave it there in shmem_getpage_gfp(). 
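Sketch of the consolidated helper (simplified from the description above; the real implementation also handles fallthrough between the cases):

  bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
  		   pgoff_t index)
  {
  	if (shmem_huge == SHMEM_HUGE_DENY)
  		return false;
  	if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
  		    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
  		return false;
  	if (shmem_huge == SHMEM_HUGE_FORCE)
  		return true;

  	switch (SHMEM_SB(inode->i_sb)->huge) {
  	case SHMEM_HUGE_ALWAYS:
  		return true;
  	case SHMEM_HUGE_WITHIN_SIZE:
  		index = round_up(index + 1, HPAGE_PMD_NR);
  		return i_size_read(inode) >= (loff_t)index << PAGE_SHIFT;
  	case SHMEM_HUGE_ADVISE:
  		return vma && (vma->vm_flags & VM_HUGEPAGE);
  	default:
  		return false;
  	}
  }

With a NULL vma (read/write syscalls) only the mount option and forced settings decide, which is exactly the consistency the patch is after.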
Link: https://lkml.kernel.org/r/23a77889-2ddc-b030-75cd-44ca27fd4d1@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a3f4502ec8a9..166158b6e917 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -86,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -96,8 +101,6 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -- cgit v1.2.3 From 2c8d8f97ae2272f1455ee31a5af62b326772eb31 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:50 -0700 Subject: mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline the mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions to perform the mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using the cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running the PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. 
[1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3403ec77528a..a00bf337567b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -693,13 +693,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_charge(page, mm, gfp_mask); +} + int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void mem_cgroup_uncharge(struct page *page); -void mem_cgroup_uncharge_list(struct list_head *page_list); +void __mem_cgroup_uncharge(struct page *page); +static inline void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge(page); +} + +void __mem_cgroup_uncharge_list(struct list_head *page_list); +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_list(page_list); +} void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -- cgit v1.2.3 From 01c4b28cd2e600160566a7d83b4703800381dae1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:54 -0700 Subject: mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. 
[1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6f5a43251593..f30d26b0f71d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return; + __cgroup_throttle_swaprate(page, gfp_mask); +} #else static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { @@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swap(page, entry); +} + +extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_swap(entry, nr_pages); +} + extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else -- cgit v1.2.3 From 7e1c0d6f58207e7e60674647d3935f446f05613c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:00 -0700 Subject: memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. 
The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a00bf337567b..47d35bef9f6e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -struct lruvec_stat { - long count[NR_VM_NODE_STAT_ITEMS]; -}; - -struct batched_lruvec_stat { - s32 count[NR_VM_NODE_STAT_ITEMS]; -}; - /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to this memcg. @@ -123,24 +115,30 @@ struct shrinker_info { unsigned long *map; }; +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_VM_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; - /* - * Legacy local VM stats. This should be struct lruvec_stat and - * cannot be optimized to struct batched_lruvec_stat. Because - * the threshold of the lruvec_stat_cpu can be as big as - * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this - * filed has no upper limit. 
- * filed has no upper limit. - */ - struct lruvec_stat __percpu *lruvec_stat_local; - - /* Subtree VM stats (batched updates) */ - struct batched_lruvec_stat __percpu *lruvec_stat_cpu; - atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; + struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); + x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; -- cgit v1.2.3 From aa48e47e3906c332eaf1e5d7b58be11d3509ad9f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:04 -0700 Subject: memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for the first two cases. Flushing the stats for the remaining two cases may have a performance impact. Always flushing the memcg stats on the page fault code path may negatively impact the performance of the applications. In addition, flushing in the memory reclaim code path, though treated as slowpath, can become a source of contention for the global lock taken for stat flushing, because when the system or a memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses the following mechanisms to solve these challenges: 1. Periodically flush the stats from the root memcg every 2 seconds. This will time-limit the out-of-sync stats. 2. Asynchronously flush the stats after a fixed number of stat updates. In the worst case the stats can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. To avoid a thundering herd flushing the stats, particularly from the memory reclaim context, introduce a memcg-local spinlock and let only one flusher be active at a time. This could have been done through the cgroup_rstat_lock lock, but that lock is used by other subsystems and for userspace reading of memcg stats. So it is better to keep the flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use the irqsafe version of rstat flush, but that is fine as this code path will be flushing the whole tree and doing the work for everyone. No one will be waiting for that worker. 
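A sketch of these three mechanisms together (simplified; the periodic interval and the trylock-based exclusion follow the description above):

  static void flush_memcg_stats_dwork(struct work_struct *w);
  static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
  static DEFINE_SPINLOCK(stats_flush_lock);

  void mem_cgroup_flush_stats(void)
  {
  	if (!spin_trylock(&stats_flush_lock))
  		return;	/* another flusher is already doing the work */

  	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
  	spin_unlock(&stats_flush_lock);
  }

  static void flush_memcg_stats_dwork(struct work_struct *w)
  {
  	mem_cgroup_flush_stats();
  	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL * HZ);
  }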
[shakeelb@google.com: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Tested-by: Marek Szyprowski Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47d35bef9f6e..c4c5b652d3af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +void mem_cgroup_flush_stats(void); + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); @@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline void mem_cgroup_flush_stats(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { -- cgit v1.2.3 From 96e51ccf1af33e82f429a0d6baebba29c6448d0f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:46 -0700 Subject: memcg: cleanup racy sum avoidance code We used to have per-cpu memcg and lruvec stats and the readers have to traverse and sum the stats from each cpu. This summing was racy and may expose transient negative values. So, an explicit check was added to avoid such scenarios. Now these stats are moved to rstat infrastructure and are no more per-cpu, so we can remove the fixup for transient negative values. 
Link: https://lkml.kernel.org/r/20210728012243.3369123-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4c5b652d3af..2ee20509ac71 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -977,30 +977,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(memcg->vmstats.state[idx]); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = READ_ONCE(pn->lruvec_stats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(pn->lruvec_stats.state[idx]); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From 55a68c823951855f3ec313fdb85100db84284505 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:49 -0700 Subject: memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses an in_interrupt() check to select the proper storage for the cgroup: a pointer on the task struct or a per-cpu pointer. This isn't fully correct: the obsolete in_interrupt() check also includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8894825cc4db..5561486fddef 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { -- cgit v1.2.3 From bec49c067c679e9b7ca7c1aac50b56618c12d879 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:56 -0700 Subject: mm, memcg: remove unused functions Since commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat"), the last user of memcg_stat_item_in_bytes() is gone. And since commit fa40d1ee9f15 ("mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()"), only the declaration of mem_cgroup_select_victim_node() remains here. Remove them. 
Link: https://lkml.kernel.org/r/20210807082835.61281-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Alex Shi Cc: Matthew Wilcox (Oracle) Cc: Vladimir Davydov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ee20509ac71..69e6c5462c43 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -593,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif -static __always_inline bool memcg_stat_item_in_bytes(int idx) -{ - if (idx == MEMCG_PERCPU_B) - return true; - return vmstat_item_in_bytes(idx); -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -904,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return !!(memcg->css.flags & CSS_ONLINE); } -/* - * For memory reclaim. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); - void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -- cgit v1.2.3 From f358afc52c3066f4e8cd7b3a2d75b31e822519e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:36 -0700 Subject: mm: remove flush_kernel_dcache_page flush_kernel_dcache_page is a rather confusing interface that implements a subset of flush_dcache_page by not being able to properly handle page cache mapped pages. The only callers left are in the exec code, as all other previous callers were incorrect since they could have dealt with page cache pages. Replace the calls to flush_kernel_dcache_page with calls to flush_dcache_page, which for all architectures either does exactly the same thing or contains one or more of the following: 1) an optimization to defer the cache flush for page cache pages not mapped into userspace 2) additional flushing for mapped page cache pages if cache aliases are possible Link: https://lkml.kernel.org/r/20210712060928.4161649-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Reviewed-by: Ira Weiny Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d9a606a9fc64..b4c49f9cc379 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page } #endif -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} +#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } -- cgit v1.2.3 From e15710bf04063766f428048f4dad7b73b646203f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Sep 2021 14:56:43 -0700 Subject: mm: change fault_in_pages_* to have an unsigned size parameter fault_in_pages_writeable() and fault_in_pages_readable() treat the size parameter as unsigned, doing pointer math with the value, so make this explicit and set it to be a size_t type which all callers currently treat it as anyway. This solves the issue where static checkers get nervous seeing pointer arithmetic happening with a signed value. Link: https://lkml.kernel.org/r/20210727111136.457638-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reported-by: Jordy Zomer Cc: Matthew Wilcox Cc: David Howells Cc: William Kucharski Cc: "Darrick J. Wong" Cc: Hugh Dickins Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..5dcf446f42e5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ -static inline int fault_in_pages_writeable(char __user *uaddr, int size) +static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) { char __user *end = uaddr + size - 1; @@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return 0; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) { volatile char c; const char __user *end = uaddr + size - 1; -- cgit v1.2.3 From fc1f5e980a463325cf41d39ac6a69aa3cca73995 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:01 -0700 Subject: mm: sparse: pass section_nr to find_memory_block With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. On the other hand, __nr_to_section() which converts section_nr to mem_section can be done in O(1). Let's pass section_nr instead of mem_section ptr to find_memory_block() in order to reduce needless iterations. 
Link: https://lkml.kernel.org/r/20210707150212.855-3-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d9a0b61cd432 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block(struct mem_section *); +extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); -- cgit v1.2.3 From 11e02d3729da1a2d4a33db5ea61291770d411884 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:04 -0700 Subject: mm: sparse: remove __section_nr() function As the last users of __section_nr() are gone, let's remove unused function __section_nr(). Link: https://lkml.kernel.org/r/20210707150212.855-4-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..8827f4d081d4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* -- cgit v1.2.3 From 01c8d337d195ed105cabab95bc4dcb9e145bf5ea Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 2 Sep 2021 14:57:07 -0700 Subject: mm/sparse: set SECTION_NID_SHIFT to 6 Currently SECTION_NID_SHIFT is set to 3, which is incorrect because bit 3 and 4 can be overlapped by sub-field for early NID, and can be unexpectedly set on NUMA systems. There are a few non-critical issues related to this: - Having SECTION_TAINT_ZONE_DEVICE set for wrong sections forces pfn_to_online_page() through the slow path, but doesn't actually break the kernel. - A kdump generation tool like makedumpfile uses this field to calculate the physical address to read. So wrong bits can make the tool access to wrong address and fail to create kdump. This can be avoided by the tool, so it's not critical. To fix it, set SECTION_NID_SHIFT to 6 which is the minimum number of available bits of section flag field. 
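The NID is stored in the upper bits of section_mem_map by the early-boot encode/decode pair, roughly as in mm/sparse.c:

  static inline unsigned long sparse_encode_early_nid(int nid)
  {
  	return ((unsigned long)nid << SECTION_NID_SHIFT);
  }

  static inline int sparse_early_nid(struct mem_section *section)
  {
  	return (section->section_mem_map >> SECTION_NID_SHIFT);
  }

With a shift of 3, nid bit 0 landed on SECTION_IS_EARLY (bit 3) and nid bit 1 on SECTION_TAINT_ZONE_DEVICE (bit 4); shifting by 6 keeps the NID clear of every flag bit shown in the diff below.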
Link: https://lkml.kernel.org/r/20210707045548.810271-1-naoya.horiguchi@linux.dev Fixes: 1f90a3477df3 ("mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions") Signed-off-by: Naoya Horiguchi Reported-by: Kazuhito Hagio Suggested-by: Dan Williams Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wang Wensheng Cc: Rui Xiang Cc: Kazu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8827f4d081d4..59bad25ce78e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1364,7 +1364,7 @@ extern size_t mem_section_usage_size(void); #define SECTION_TAINT_ZONE_DEVICE (1UL<<4) #define SECTION_MAP_LAST_BIT (1UL<<5) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_NID_SHIFT 6 static inline struct page *__section_mem_map_addr(struct mem_section *section) { -- cgit v1.2.3 From d0505e9f7dcec85da6634ec66da2b17656ee177b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:31 -0700 Subject: mm: hwpoison: don't drop slab caches for offlining non-LRU page In the current implementation of soft offline, if a non-LRU page is met, all the slab caches will be dropped to free the page and then offline it. But if the page is not a slab page, all that effort is in vain. Even if it is a slab page, it is not guaranteed the page could be freed at all. However, the side effect and cost are quite high. It does not only drop the slab caches, but may also drop a significant amount of page caches which are associated with inode caches. It could make most of the workingset gone in order to just offline a page. And the offline is not guaranteed to succeed at all; actually I really doubt the success rate for real-life workloads. Furthermore, the worst consequence is that the system may be locked up and unusable, since the page cache release may incur a huge amount of work queued for memcg release. Actually we ran into such an unpleasant case in our production environment. Firstly, the workqueue of memory_failure_work_func is locked up as below: BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 53s! 
Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=14/256 refcnt=15 in-flight: 409271:memory_failure_work_func pending: kfree_rcu_work, kfree_rcu_monitor, kfree_rcu_work, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, drain_local_stock, kfree_rcu_work workqueue mm_percpu_wq: flags=0x8 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update workqueue cgroup_destroy: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 refcnt=12072 pending: css_release_work_fn There were over 12K css_release_work_fn queued, and this caused a few lockups due to the contention of worker pool lock with IRQ disabled, for example: NMI watchdog: Watchdog detected hard LOCKUP on cpu 1 Modules linked in: amd64_edac_mod edac_mce_amd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel xt_DSCP iptable_mangle kvm_amd bpfilter vfat fat acpi_ipmi i2c_piix4 usb_storage ipmi_si k10temp i2c_core ipmi_devintf ipmi_msghandler acpi_cpufreq sch_fq_codel xfs libcrc32c crc32c_intel mlx5_core mlxfw nvme xhci_pci ptp nvme_core pps_core xhci_hcd CPU: 1 PID: 205500 Comm: kworker/1:0 Tainted: G L 5.10.32-t1.el7.twitter.x86_64 #1 Hardware name: TYAN F5AMT /z /S8026GM2NRE-CGN, BIOS V8.030 03/30/2021 Workqueue: events memory_failure_work_func RIP: 0010:queued_spin_lock_slowpath+0x41/0x1a0 Code: 41 f0 0f ba 2f 08 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09 d0 a9 00 01 ff ff 75 1b 85 c0 74 0e 8b 07 84 c0 74 08 f3 90 <8b> 07 84 c0 75 f8 b8 01 00 00 00 66 89 07 c3 f6 c4 01 75 04 c6 47 RSP: 0018:ffff9b2ac278f900 EFLAGS: 00000002 RAX: 0000000000480101 RBX: ffff8ce98ce71800 RCX: 0000000000000084 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8ce98ce6a140 RBP: 00000000000284c8 R08: ffffd7248dcb6808 R09: 0000000000000000 R10: 0000000000000003 R11: ffff9b2ac278f9b0 R12: 0000000000000001 R13: ffff8cb44dab9c00 R14: ffffffffbd1ce6a0 R15: ffff8cacaa37f068 FS: 0000000000000000(0000) GS:ffff8ce98ce40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcf6e8cb000 CR3: 0000000a0c60a000 CR4: 0000000000350ee0 Call Trace: __queue_work+0xd6/0x3c0 queue_work_on+0x1c/0x30 uncharge_batch+0x10e/0x110 mem_cgroup_uncharge_list+0x6d/0x80 release_pages+0x37f/0x3f0 __pagevec_release+0x1c/0x50 __invalidate_mapping_pages+0x348/0x380 inode_lru_isolate+0x10a/0x160 __list_lru_walk_one+0x7b/0x170 list_lru_walk_one+0x4a/0x60 prune_icache_sb+0x37/0x50 super_cache_scan+0x123/0x1a0 do_shrink_slab+0x10c/0x2c0 shrink_slab+0x1f1/0x290 drop_slab_node+0x4d/0x70 soft_offline_page+0x1ac/0x5b0 memory_failure_work_func+0x6a/0x90 process_one_work+0x19e/0x340 worker_thread+0x30/0x360 kthread+0x116/0x130 The lockup made the machine is quite unusable. And it also made the most workingset gone, the reclaimabled slab caches were reduced from 12G to 300MB, the page caches were decreased from 17G to 4G. But the most disappointing thing is all the effort doesn't make the page offline, it just returns: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It seems the aggressive behavior for non-LRU page didn't pay back, so it doesn't make too much sense to keep it considering the terrible side effect. 
Link: https://lkml.kernel.org/r/20210819054116.266126-1-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: David Mackey Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35bbac32b6f6..11c38550627c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3110,7 +3110,7 @@ extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); -- cgit v1.2.3 From 09a26e832705fdb7a9484495b71a05e0bbc65207 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:53 -0700 Subject: hugetlb: fix hugetlb cgroup refcounting during vma split Guillaume Morin reported hitting the following WARNING followed by a GPF or NULL pointer dereference either in cgroups_destroy or in the kill_css path: percpu ref (css_release) <= 0 (-1) after switching to atomic WARNING: CPU: 23 PID: 130 at lib/percpu-refcount.c:196 percpu_ref_switch_to_atomic_rcu+0x127/0x130 CPU: 23 PID: 130 Comm: ksoftirqd/23 Kdump: loaded Tainted: G O 5.10.60 #1 RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x127/0x130 Call Trace: rcu_core+0x30f/0x530 rcu_core_si+0xe/0x10 __do_softirq+0x103/0x2a2 run_ksoftirqd+0x2b/0x40 smpboot_thread_fn+0x11a/0x170 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 Upon further examination, it was discovered that the css structure was associated with hugetlb reservations. For private hugetlb mappings the vma points to a reserve map that contains a pointer to the css. At mmap time, reservations are set up and a reference to the css is taken. This reference is dropped in the vma close operation, hugetlb_vm_op_close. However, if a vma is split, no additional reference to the css is taken, yet hugetlb_vm_op_close will be called twice for the split vma, resulting in an underflow. Fix by taking another reference in hugetlb_vm_op_open. Note that the reference is only taken for the owner of the reserve map. In the more common fork case, the pointer to the reserve map is cleared for non-owning vmas.
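A minimal sketch of the open-side fix that pairs with the helper added below; the real hunk lives in mm/hugetlb.c and carries more context:

static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
	struct resv_map *resv = vma_resv_map(vma);

	/*
	 * Sketch: only the owner of the reserve map holds the css
	 * reference, so take one more here to balance the second
	 * hugetlb_vm_op_close() that a vma split will produce.
	 */
	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		kref_get(&resv->refs);
		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
	}
}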
Link: https://lkml.kernel.org/r/20210830215015.155224-1-mike.kravetz@oracle.com Fixes: e9fe92ae0cd2 ("hugetlb_cgroup: add reservation accounting for private mappings") Signed-off-by: Mike Kravetz Reported-by: Guillaume Morin Suggested-by: Guillaume Morin Tested-by: Guillaume Morin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0b8d1fdda3a1..c137396129db 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) css_put(&h_cg->css); } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_get(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { -- cgit v1.2.3 From a759a909d42d727e918bd5248d6cff7562fa8109 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:56 -0700 Subject: userfaultfd: change mmap_changing to atomic Patch series "userfaultfd: minor bug fixes". Three unrelated bug fixes. The first two address possible issues (not too theoretical ones), but I did not encounter them in practice. The third patch addresses a test bug that causes the test to fail on my system. It has been sent before as part of a bigger RFC. This patch (of 3): mmap_changing is currently a boolean variable, which is set and cleared without any lock that protects against concurrent modifications. mmap_changing is supposed to mark whether userfaultfd page-fault handling should be retried since mappings are undergoing a change. However, concurrent calls, for instance to madvise(MADV_DONTNEED), might cause mmap_changing to be false, although the remove event was still not read (hence acknowledged) by the user. Change mmap_changing to atomic_t and increase/decrease it appropriately. Add a debug assertion to see whether mmap_changing is negative.
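A sketch of the resulting protocol; the ctx field name matches the diff below, while the surrounding call sites (fs/userfaultfd.c and mm/userfaultfd.c) are abbreviated here:

	/* writer side: a mapping change (e.g. an madvise remove) begins */
	atomic_inc(&ctx->mmap_changing);
	/* ... queue the remove event for userspace ... */

	/* once the event has been read (acknowledged) by the user */
	atomic_dec(&ctx->mmap_changing);

	/* reader side: refuse to service copies while changes are pending */
	if (atomic_read(mmap_changing))
		return -EAGAIN;

	/* the debug assertion mentioned above */
	VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);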
Link: https://lkml.kernel.org/r/20210808020724.1022515-1-namit@vmware.com Link: https://lkml.kernel.org/r/20210808020724.1022515-2-namit@vmware.com Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Nadav Amit Cc: Mike Rapoport Cc: Peter Xu Cc: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bcc..33cea484d1ad 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode); + atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, - bool *mmap_changing); + atomic_t *mmap_changing); extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, bool *mmap_changing); + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, - bool enable_wp, bool *mmap_changing); + bool enable_wp, atomic_t *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, -- cgit v1.2.3 From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. 
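For illustration, a sketch of a caller that wants the precise count against the new signature shown below; MR_DEMOTION, alloc_demote_page and the PGDEMOTE_* counter come from the follow-up patches in this series and are assumptions here, and callers that don't care can simply pass NULL:

	LIST_HEAD(demote_pages);
	unsigned int nr_succeeded = 0;
	int err;

	/* ... isolate candidate pages onto demote_pages ... */
	err = migrate_pages(&demote_pages, alloc_demote_page, NULL,
			    (unsigned long)pgdat, MIGRATE_ASYNC, MR_DEMOTION,
			    &nr_succeeded);
	if (err)
		putback_movable_pages(&demote_pages);

	/* nr_succeeded is valid even when err is non-zero */
	count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);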
Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) -- cgit v1.2.3 From 26aa2d199d6f2cfa6f2ef2a5dfe891f2250e71a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:16 -0700 Subject: mm/migrate: demote pages during reclaim This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. In some cases, for example MADV_PAGEOUT, the pages are always discarded instead of demoted, to follow the kernel API definition, because MADV_PAGEOUT is defined as freeing the specified pages regardless of which tier they are in. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in migrate_demote_page_ok().
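The gate itself is presumably shaped like the following sketch, built on the next_demotion_node() helper added in the next hunk; details differ upstream, and as noted the FIXME keeps it disabled for now:

static bool migrate_demote_page_ok(struct page *page)
{
	int next_nid = next_demotion_node(page_to_nid(page));

	/* no lower tier to demote to */
	if (next_nid == NUMA_NO_NODE)
		return false;
	if (PageTransHuge(page) && !thp_migration_supported())
		return false;

	/* FIXME: actually enable this later */
	return false;
}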
[dave.hansen@linux.intel.com: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8ab88d46318e..326250996b4e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -28,6 +28,7 @@ enum migrate_reason { MR_NUMA_MISPLACED, MR_CONTIG_RANGE, MR_LONGTERM_PIN, + MR_DEMOTION, MR_TYPES }; @@ -167,6 +168,14 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int next_demotion_node(int node); + +#else /* CONFIG_MIGRATION disabled: */ + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_MIGRATION */ -- cgit v1.2.3 From 668e4147d8850df32ca41e28f52c146025ca45c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:19 -0700 Subject: mm/vmscan: add page demotion counter Account the number of demoted pages. Add pgdemote_kswapd and pgdemote_direct VM counters shown in /proc/vmstat. [ daveh: - __count_vm_events() a bit, and made them look at the THP size directly rather than getting data from migrate_pages() ] Link: https://lkml.kernel.org/r/20210721063926.3024591-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-6-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Oscar Salvador Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ae0dd1948c2b..a185cc75ff52 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, -- cgit v1.2.3 From 20b51af15e014cac63b58a4f8b8b323ac35bccce Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Sep 2021 14:59:33 -0700 Subject: mm/migrate: add sysfs interface to enable reclaim migration Some method is obviously needed to enable reclaim-based migration. Just like traditional autonuma, there will be some workloads that will benefit, like workloads with more "static" configurations where hot pages stay hot and cold pages stay cold. If pages come and go from the hot and cold sets, the benefits of this approach will be more limited. The benefits are truly workload-based and *not* hardware-based. We do not believe that there is a viable threshold where certain hardware configurations should have this mechanism enabled while others do not.
To be conservative, earlier work defaulted to disabling reclaim-based migration and did not include a mechanism to enable it. This patch proposes adding a new sysfs file, /sys/kernel/mm/numa/demotion_enabled, as a method to enable it. We are open to any alternative that allows end users to enable this mechanism or disable it if workload harm is detected (just like traditional autonuma). Once this is enabled, page demotion may move data to a NUMA node that does not fall into the cpuset of the allocating process. This could be construed to violate the guarantees of cpusets. However, since this is an opt-in mechanism, the assumption is that anyone enabling it is content to relax the guarantees. Link: https://lkml.kernel.org/r/20210721063926.3024591-9-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-10-ying.huang@intel.com Signed-off-by: Huang Ying Originally-by: Dave Hansen Cc: Michal Hocko Cc: Wei Xu Cc: Yang Shi Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0aaf91b496e2..4ca025e2a77e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,8 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern bool numa_demotion_enabled; + #else struct mempolicy {}; @@ -292,5 +294,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +#define numa_demotion_enabled false #endif /* CONFIG_NUMA */ #endif -- cgit v1.2.3 From 9647875be52b33fe22cb034ec3074896c581543f Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 2 Sep 2021 14:59:36 -0700 Subject: mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get the memcg directly from vmpr instead of vmpr->memcg->css->memcg, so add a new helper function, vmpressure_to_memcg(). No code will use vmpressure_to_css() any more, so delete it.
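Given the existing memcg_to_vmpressure() direction, the new helper is presumably just the inverse container_of() lookup, roughly:

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	/* sketch: assumes the vmpressure struct is embedded in struct mem_cgroup */
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}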
Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6d28bc433c1c..6a2f51ebbfd3 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); -extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr); extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -- cgit v1.2.3 From b87c517ac5de168aec6e8318ca0707b11b2ccfaf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:46 -0700 Subject: mm/vmscan: remove unneeded return value of kswapd_run() The return value of kswapd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20210717065911.61497-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30d26b0f71d..ba52f3a3478e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void) extern void check_move_unevictable_pages(struct pagevec *pvec); -extern int kswapd_run(int nid); +extern void kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -- cgit v1.2.3 From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: mm: compaction: support triggering of proactive compaction by user Proactive compaction[1] is triggered every 500 msec and runs compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages, based on the value set in sysctl.compaction_proactiveness. Triggering compaction every 500 msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially for embedded system use cases which may have only a few MBs of RAM. Enabling proactive compaction in its current state will end up running almost constantly on such systems. On the other hand, proactive compaction can still be very useful for getting a set of higher-order pages in a controllable manner (via sysctl.compaction_proactiveness). So, on systems where always-on proactive compaction may not be required, allow triggering it from user space by writing to its sysctl interface.
As an example, say an app launcher decides to launch a memory-heavy application which can be launched faster if it gets more higher-order pages; the launcher can then prepare the system in advance by triggering proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by the user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7acca..34bce35c808d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e..1bd5f5955f9a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available -- cgit v1.2.3 From cfcaa66f803233c50e17239469f6c96136a673a1 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 2 Sep 2021 15:00:13 -0700 Subject: mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" checks, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY.
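In rough shape, the hugetlb side presumably uses the helper like the sketch below; the exact gfp handling and the allocator entry point (dequeue_huge_page_nodemask() here) are illustrative rather than the definitive implementation:

	if (mpol_is_preferred_many(mpol)) {
		gfp_t gfp = gfp_mask | __GFP_NOWARN;

		/* first pass: preferred nodes only, without heavy reclaim */
		gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
		page = dequeue_huge_page_nodemask(h, gfp, nid, nodemask);

		/* second pass: fall back to all other nodes */
		if (!page)
			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
	}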
[akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] [ben.widawsky@intel.com: add helpers to avoid ifdefs] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com [nathan@kernel.org: initialize page to NULL in alloc_buddy_huge_page_with_mpol()] Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Signed-off-by: Nathan Chancellor Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4ca025e2a77e..4091692bed8c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -186,6 +186,12 @@ extern void mpol_put_task_policy(struct task_struct *); extern bool numa_demotion_enabled; +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + + #else struct mempolicy {}; @@ -296,5 +302,11 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) } #define numa_demotion_enabled false + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} + #endif /* CONFIG_NUMA */ #endif -- cgit v1.2.3 From a7259df7670240ee03b0cfce8a3e5d3773911e24 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 15:00:26 -0700 Subject: memblock: make memblock_find_in_range method private There are a lot of uses of memblock_find_in_range() along with memblock_reserve() from the times when the memblock allocation APIs did not exist. memblock_find_in_range() is the very core of memblock allocations, so any future changes to its internal behaviour would mandate updates of all the users outside memblock. Replace the calls to memblock_find_in_range() with equivalent calls to memblock_phys_alloc() and memblock_phys_alloc_range(), and make memblock_find_in_range() a private method of memblock. This simplifies the callers, ensures that (unlikely) errors in memblock_reserve() are handled and improves maintainability of memblock_find_in_range(). Link: https://lkml.kernel.org/r/20210816122622.30279-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Catalin Marinas [arm64] Acked-by: Kirill A. Shutemov Acked-by: Rafael J.
Wysocki [ACPI] Acked-by: Russell King (Oracle) Acked-by: Nick Kossifidis [riscv] Tested-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4a53c3ca86bd..b066024c62e3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -99,8 +99,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); -- cgit v1.2.3 From 0b303fb402862dcb7948eeeed2439bd8c99948b5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 8 May 2021 02:28:02 +0200 Subject: mm, slub: do initial checks in ___slab_alloc() with irqs enabled As another step of shortening irq disabled sections in ___slab_alloc(), delay disabling irqs until we pass the initial checks if there is a cached percpu slab and it's suitable for our allocation. Now we have to recheck c->page after actually disabling irqs as an allocation in irq handler might have replaced it. Because we call pfmemalloc_match() as one of the checks, we might hit VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() variant that lacks the PageSlab check. 
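The variant is presumably a near-copy of pfmemalloc_match() without the PageSlab assertion, built on the __PageSlabPfmemalloc() helper added below; a minimal sketch:

static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
{
	/*
	 * Sketch: same test as pfmemalloc_match(), but the page may have
	 * been freed under us, so use the VM_BUG_ON-free flag check.
	 */
	if (unlikely(__PageSlabPfmemalloc(page)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}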
Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman --- include/linux/page-flags.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..7fda4fb85bdc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(struct page *page) return PageActive(page); } +/* + * A version of PageSlabPfmemalloc() for opportunistic checks where the page + * might have been freed under us and not be a PageSlab anymore. + */ +static inline int __PageSlabPfmemalloc(struct page *page) +{ + return PageActive(page); +} + static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); -- cgit v1.2.3 From 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 11:18:44 +0100 Subject: iov_iter: track truncated size Remember how many bytes were truncated and then reverted. Because iterators that have not been reexpanded don't always work well with reverting, we may need to know the truncated size in order to reexpand ourselves when needed. Signed-off-by: Pavel Begunkov Signed-off-by: Al Viro --- include/linux/uio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 82c3c3e819e0..5265024e8b90 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -47,6 +47,7 @@ struct iov_iter { }; loff_t xarray_start; }; + size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -254,8 +255,10 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ - if (i->count > count) + if (i->count > count) { + i->truncated += i->count - count; i->count = count; + } } /* @@ -264,6 +267,7 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { + i->truncated -= count - i->count; i->count = count; } -- cgit v1.2.3 From bd0e7491a931f5a2960555b10b9551464ff8cc8e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 22 May 2021 01:59:38 +0200 Subject: mm, slub: convert kmem_cpu_slab protection to local_lock Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's equivalent, with better lockdep visibility. On PREEMPT_RT that means better preemption. However, the cost on PREEMPT_RT is the loss of lockless fast paths which only work with the cpu freelist. Those are designed to detect and recover from being preempted by other conflicting operations (both fast or slow path), but the slow path operations assume they cannot be preempted by a fast path operation, which is guaranteed naturally with disabled irqs. With local locks on PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. In the allocation fastpath slab_alloc_node() we can just defer to the slowpath __slab_alloc() which also works with the cpu freelist, but under the local lock. In the free fastpath do_slab_free() we have to add a new local lock protected version of freeing to the cpu freelist, as the existing slowpath only works with the page freelist. Also update the comment about the locking scheme in SLUB to reflect changes done by this series.
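In shape, the conversion swaps bare IRQ toggling for the embedded lock; a sketch using the field added in the hunk below (the surrounding slow-path code in mm/slub.c is abbreviated):

	unsigned long flags;
	struct kmem_cache_cpu *c;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	c = this_cpu_ptr(s->cpu_slab);
	/* ... manipulate c->freelist / c->page under the local lock ... */
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	/* was: local_irq_restore(flags); */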
[ Mike Galbraith : use local_lock() without irq in PREEMPT_RT scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ] Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dcde82a4434c..85499f0586b0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -40,6 +41,10 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ @@ -47,6 +52,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ #endif + local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif -- cgit v1.2.3 From 48eab831ae8b9f7002a533fa4235eed63ea1f1a3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Sep 2021 11:10:37 -0700 Subject: net: create netdev->dev_addr assignment helpers Recent work on converting address list to a tree made it obvious we need an abstraction around writing netdev->dev_addr. Without such abstraction updating the main device address is invisible to the core. Introduce a number of helpers which for now just wrap memcpy() but in the future can make necessary changes to the address tree. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 12 ++++++++++++ include/linux/netdevice.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 330345b1be54..928c411bd509 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -299,6 +299,18 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) #endif } +/** + * eth_hw_addr_set - Assign Ethernet address to a net_device + * @dev: pointer to net_device structure + * @addr: address to assign + * + * Assign given address to the net_device, addr_assign_type is not changed. 
+ */ +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + ether_addr_copy(dev->dev_addr, addr); +} + /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c41593c1d6a..d79163208dfd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4641,6 +4641,24 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +static inline void +__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len) +{ + memcpy(dev->dev_addr, addr, len); +} + +static inline void dev_addr_set(struct net_device *dev, const u8 *addr) +{ + __dev_addr_set(dev, addr, dev->addr_len); +} + +static inline void +dev_addr_mod(struct net_device *dev, unsigned int offset, + const u8 *addr, size_t len) +{ + memcpy(&dev->dev_addr[offset], addr, len); +} + int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, -- cgit v1.2.3 From 8486a32dd484a7d7ec25295c7439094608f54915 Mon Sep 17 00:00:00 2001 From: "Hector.Yuan" Date: Fri, 3 Sep 2021 16:39:23 +0800 Subject: cpufreq: Add of_perf_domain_get_sharing_cpumask Add the of_perf_domain_get_sharing_cpumask() function to group CPUs into a specific performance domain. Signed-off-by: Hector.Yuan [ Viresh: create separate routine parse_perf_domain() and always set the cpumask. ] Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..acd3ee5b8b0a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -1003,6 +1005,55 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy return count; } + +static inline int parse_perf_domain(int cpu, const char *list_name, + const char *cell_name) +{ + struct device_node *cpu_np; + struct of_phandle_args args; + int ret; + + cpu_np = of_cpu_device_node_get(cpu); + if (!cpu_np) + return -ENODEV; + + ret = of_parse_phandle_with_args(cpu_np, list_name, cell_name, 0, + &args); + if (ret < 0) + return ret; + + of_node_put(cpu_np); + + return args.args[0]; +} + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + int target_idx; + int cpu, ret; + + ret = parse_perf_domain(pcpu, list_name, cell_name); + if (ret < 0) + return ret; + + target_idx = ret; + cpumask_set_cpu(pcpu, cpumask); + + for_each_possible_cpu(cpu) { + if (cpu == pcpu) + continue; + + ret = parse_perf_domain(cpu, list_name, cell_name); + if (ret < 0) + continue; + + if (target_idx == ret) + cpumask_set_cpu(cpu, cpumask); + } + + return target_idx; +} #else static inline int cpufreq_boost_trigger_state(int state) { @@ -1022,6 +1073,12 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) { return false; } + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + return -EOPNOTSUPP; +} #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -1043,7
+1100,6 @@ void arch_set_freq_scale(const struct cpumask *cpus, { } #endif - /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -- cgit v1.2.3 From 3cc4e148b96263313e3dce926eae569c942bb74e Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 17 Aug 2021 00:26:39 +0000 Subject: KVM: stats: Add VM stat for remote tlb flush requests Add a new stat that counts the number of times a remote TLB flush is requested, regardless of whether it kicks vCPUs out of guest mode. This allows us to look at how often flushes are initiated. Unlike remote_tlb_flush, this one applies to ARM's instruction-set-based TLB flush implementation, so apply it there too. Original-by: David Matlack Signed-off-by: Jing Zhang Message-Id: <20210817002639.3856694-1-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e4d712e9f760..c177789a8542 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1444,7 +1444,8 @@ struct _kvm_stats_desc { KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ - STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index de7fb5f364d8..2237abb93ccd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -80,6 +80,7 @@ struct kvm_mmu_memory_cache { struct kvm_vm_stat_generic { u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; }; struct kvm_vcpu_stat_generic { -- cgit v1.2.3 From 9c930054f2f5326d59ee4bf8d7d1cf6c82f5643b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:24 +0800 Subject: file: Export receive_fd() to modules Export receive_fd() so that modules can use it to pass file descriptors between processes without missing any of the security checks. Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-4-xieyongji@bytedance.com Signed-off-by: Michael S.
Tsirkin --- include/linux/file.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 2de2e4613d7b..51e830b4fe3a 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -94,6 +94,9 @@ extern void fd_install(unsigned int fd, struct file *file); extern int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); + +extern int receive_fd(struct file *file, unsigned int o_flags); + static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { @@ -101,10 +104,6 @@ static inline int receive_fd_user(struct file *file, int __user *ufd, return -EFAULT; return __receive_fd(file, ufd, o_flags); } -static inline int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); -- cgit v1.2.3 From 86e17a51c1a5a299009f8b1645e3e9da0d59faae Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:25 +0800 Subject: vdpa: Fix some coding style issues Fix some code indent issues and following checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' 371: FILE: include/linux/vdpa.h:371: +static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, Signed-off-by: Xie Yongji Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210831103634.33-5-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8cfe49d201dd..8ae1134070eb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -43,17 +43,17 @@ struct vdpa_vq_state_split { * @last_used_idx: used index */ struct vdpa_vq_state_packed { - u16 last_avail_counter:1; - u16 last_avail_idx:15; - u16 last_used_counter:1; - u16 last_used_idx:15; + u16 last_avail_counter:1; + u16 last_avail_idx:15; + u16 last_used_counter:1; + u16 last_used_idx:15; }; struct vdpa_vq_state { - union { - struct vdpa_vq_state_split split; - struct vdpa_vq_state_packed packed; - }; + union { + struct vdpa_vq_state_split split; + struct vdpa_vq_state_packed packed; + }; }; struct vdpa_mgmt_dev; @@ -131,7 +131,7 @@ struct vdpa_iova_range { * @vdev: vdpa device * @idx: virtqueue index * @state: pointer to returned state (last_avail_idx) - * @get_vq_notification: Get the notification area for a virtqueue + * @get_vq_notification: Get the notification area for a virtqueue * @vdev: vdpa device * @idx: virtqueue index * Returns the notifcation area @@ -350,25 +350,25 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) static inline void vdpa_reset(struct vdpa_device *vdev) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + ops->set_status(vdev, 0); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = true; - return ops->set_features(vdev, features); + return ops->set_features(vdev, features); } - -static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, - void *buf, unsigned int len) +static 
inline void vdpa_get_config(struct vdpa_device *vdev, + unsigned int offset, void *buf, + unsigned int len) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. -- cgit v1.2.3 From 0686082dbf7a204ca0fab326a820779e31666639 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:26 +0800 Subject: vdpa: Add reset callback in vdpa_config_ops This adds a new callback to support device specific reset behavior. The vdpa bus driver will call the reset function instead of setting status to zero during resetting. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210831103634.33-6-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8ae1134070eb..e1eae8c7483d 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -171,6 +171,9 @@ struct vdpa_iova_range { * @set_status: Set the device status * @vdev: vdpa device * @status: virtio device status + * @reset: Reset device + * @vdev: vdpa device + * Returns integer: success (0) or error (< 0) * @get_config_size: Get the size of the configuration space * @vdev: vdpa device * Returns size_t: configuration size @@ -255,6 +258,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + int (*reset)(struct vdpa_device *vdev); size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); @@ -348,12 +352,12 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) return vdev->dma_dev; } -static inline void vdpa_reset(struct vdpa_device *vdev) +static inline int vdpa_reset(struct vdpa_device *vdev) { const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + return ops->reset(vdev); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) -- cgit v1.2.3 From 59dfe4f1e810b5820443c84f9863b04b033143e8 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:28 +0800 Subject: vhost-iotlb: Add an opaque pointer for vhost IOTLB Add an opaque pointer for vhost IOTLB. And introduce vhost_iotlb_add_range_ctx() to accept it. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-8-xieyongji@bytedance.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vhost_iotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h index 6b09b786a762..2d0e2f52f938 100644 --- a/include/linux/vhost_iotlb.h +++ b/include/linux/vhost_iotlb.h @@ -17,6 +17,7 @@ struct vhost_iotlb_map { u32 perm; u32 flags_padding; u64 __subtree_last; + void *opaque; }; #define VHOST_IOTLB_FLAG_RETIRE 0x1 @@ -29,6 +30,8 @@ struct vhost_iotlb { unsigned int flags; }; +int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, u64 start, u64 last, + u64 addr, unsigned int perm, void *opaque); int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, u64 addr, unsigned int perm); void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); -- cgit v1.2.3 From c10fb9454adc80c062151c6a436047e1fa59e99f Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:29 +0800 Subject: vdpa: Add an opaque pointer for vdpa_config_ops.dma_map() Add an opaque pointer for DMA mapping. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-9-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index e1eae8c7483d..f3014aaca47e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -270,7 +270,7 @@ struct vdpa_config_ops { /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, - u64 pa, u32 perm); + u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); /* Free device resources */ -- cgit v1.2.3 From d8945ec411209272bcd4ae9e75ea1b078257e492 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:31 +0800 Subject: vdpa: Support transferring virtual addressing during DMA mapping This patch introduces an attribute for vDPA device to indicate whether virtual address can be used. If vDPA device driver set it, vhost-vdpa bus driver will not pin user page and transfer userspace virtual address instead of physical address during DMA mapping. And corresponding vma->vm_file and offset will be also passed as an opaque pointer. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-11-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index f3014aaca47e..3972ab765de1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -65,6 +65,7 @@ struct vdpa_mgmt_dev; * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @use_va: indicate whether virtual address must be used by this device * @nvqs: maximum number of supported virtqueues * @mdev: management device pointer; caller must setup when registering device as part * of dev_add() mgmtdev ops callback before invoking _vdpa_register_device(). 
@@ -75,6 +76,7 @@ struct vdpa_device { const struct vdpa_config_ops *config; unsigned int index; bool features_valid; + bool use_va; int nvqs; struct vdpa_mgmt_dev *mdev; }; @@ -89,6 +91,16 @@ struct vdpa_iova_range { u64 last; }; +/** + * Corresponding file area for device memory mapping + * @file: vma->vm_file for the mapping + * @offset: mapping offset in the vm_file + */ +struct vdpa_map_file { + struct file *file; + u64 offset; +}; + /** * struct vdpa_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the @@ -279,7 +291,8 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - size_t size, const char *name); + size_t size, const char *name, + bool use_va); /** * vdpa_alloc_device - allocate and initilaize a vDPA device @@ -289,15 +302,16 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @parent: the parent device * @config: the bus operations that is supported by this device * @name: name of the vdpa device + * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, name) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va) \ container_of(__vdpa_alloc_device( \ parent, config, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name), \ + dev_struct, member)), name, use_va), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, int nvqs); -- cgit v1.2.3 From f97493657c6372eeefe70faadd214bf31488c44e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 7 Sep 2021 18:56:46 +0800 Subject: net: phylink: add suspend/resume support Joakim Zhang reports that Wake-on-Lan with the stmmac ethernet driver broke when moving the incorrect handling of mac link state out of mac_config(). This reason this breaks is because the stmmac's WoL is handled by the MAC rather than the PHY, and phylink doesn't cater for that scenario. This patch adds the necessary phylink code to handle suspend/resume events according to whether the MAC still needs a valid link or not. This is the barest minimum for this support. Reported-by: Joakim Zhang Tested-by: Joakim Zhang Signed-off-by: Russell King (Oracle) Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index afb3ded0b691..237291196ce2 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -451,6 +451,9 @@ void phylink_mac_change(struct phylink *, bool up); void phylink_start(struct phylink *); void phylink_stop(struct phylink *); +void phylink_suspend(struct phylink *pl, bool mac_wol); +void phylink_resume(struct phylink *pl); + void phylink_ethtool_get_wol(struct phylink *, struct ethtool_wolinfo *); int phylink_ethtool_set_wol(struct phylink *, struct ethtool_wolinfo *); -- cgit v1.2.3 From cd1adf1b63a112d762832e9c64b0a886fbb840d6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Sep 2021 11:03:45 -0700 Subject: Revert "mm/gup: remove try_get_page(), call try_get_compound_head() directly" This reverts commit 9857a17f206ff374aea78bccfb687f145368be2e. That commit was completely broken, and I should have caught on to it earlier. But happily, the kernel test robot noticed the breakage fairly quickly. 
The breakage is because "try_get_page()" is about avoiding the page reference count overflow case, but is otherwise the exact same as a plain "get_page()". In contrast, "try_get_compound_head()" is an entirely different beast, and uses __page_cache_add_speculative() because it's not just about the page reference count, but also about possibly racing with the underlying page going away. So all the commentary about how "try_get_page() has fallen a little behind in terms of maintenance, try_get_compound_head() handles speculative page references more thoroughly" was just completely wrong: yes, try_get_compound_head() handles speculative page references, but the point is that try_get_page() does not, and must not. So there's no lack of maintenance - there are fundamentally different semantics. A speculative page reference would be entirely wrong in "get_page()", and it's entirely wrong in "try_get_page()". It's not about speculation, it's purely about "uhhuh, you can't get this page because you've tried to increment the reference count too much already". The reason the kernel test robot noticed this bug was that it hit the VM_BUG_ON() in __page_cache_add_speculative(), which is all about verifying that the context of any speculative page access is correct. But since that isn't what try_get_page() is all about, the VM_BUG_ON() tests things that are not correct to test for try_get_page(). Reported-by: kernel test robot Cc: John Hubbard Cc: Christoph Hellwig Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 50e2c2914ac2..73a52aba448f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1218,7 +1218,15 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); -struct page *try_get_compound_head(struct page *page, int refs); + +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} static inline void put_page(struct page *page) { -- cgit v1.2.3 From ca67408ad57a5a67ad6801d792c40c010451bdef Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 6 Sep 2021 09:44:52 +0100 Subject: PM: EM: fix kernel-doc comments Fix the kernel-doc comments for the improved Energy Model documentation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1834752c5617..39dcadd492b5 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -11,7 +11,7 @@ #include /** - * em_perf_state - Performance state of a performance domain + * struct em_perf_state - Performance state of a performance domain * @frequency: The frequency in KHz, for consistency with CPUFreq * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic.
@@ -25,7 +25,7 @@ struct em_perf_state { }; /** - * em_perf_domain - Performance domain + * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states * @milliwatts: Flag indicating the power values are in milli-Watts @@ -103,12 +103,12 @@ void em_dev_unregister_perf_domain(struct device *dev); /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a - performance domain + * performance domain * @pd : performance domain for which energy has to be estimated * @max_util : highest utilization among CPUs of the domain * @sum_util : sum of the utilization of all CPUs in the domain * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which - might reflect reduced frequency (due to thermal) + * might reflect reduced frequency (due to thermal) * * This function must be used only for CPU devices. There is no validation, * i.e. if the EM is a CPU type and has cpumask allocated. It is called from -- cgit v1.2.3 From 39ff83f2f6cc5cc1458dfcea9697f96338210beb Mon Sep 17 00:00:00 2001 From: Lukas Hannen Date: Wed, 25 Aug 2021 10:12:43 +0000 Subject: time: Handle negative seconds correctly in timespec64_to_ns() timespec64_to_ns() prevents multiplication overflows by comparing the seconds value of the timespec to KTIME_SEC_MAX. If the value is greater or equal, it returns KTIME_MAX. But that check casts the signed seconds value to unsigned, which makes the comparison true for all negative values and therefore wrongly returns KTIME_MAX. Negative second values are perfectly valid and required in some places, e.g. ptp_clock_adjtime(). Remove the cast and add a check for the negative boundary which is required to prevent undefined behaviour due to multiplication underflow. Fixes: cb47755725da ("time: Prevent undefined behaviour in timespec64_to_ns()") Signed-off-by: Lukas Hannen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/AM6PR01MB541637BD6F336B8FFB72AF80EEC69@AM6PR01MB5416.eurprd01.prod.exchangelabs.com --- include/linux/time64.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 5117cb5b5656..81b9686a2079 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -25,7 +25,9 @@ struct itimerspec64 { #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): @@ -124,10 +126,13 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { - /* Prevent multiplication overflow */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + /* Prevent multiplication overflow / underflow */ + if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; + if (ts->tv_sec <= KTIME_SEC_MIN) + return KTIME_MIN; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } -- cgit v1.2.3 From 859a85ddf90e714092dea71a0e54c7b9896621be Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 7 Sep 2021 19:54:52 -0700 Subject: mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock.
This makes the pfn_valid_within() check and the CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone(), which had pfn_valid_within() surrounded by more logic than a simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and the rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock, and so nothing can enable CONFIG_HOLES_IN_ZONE, which guards the non-trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..ee3a86830519 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. - */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:54:59 -0700 Subject: mm/memory_hotplug: use "unsigned long" for PFN in zone_for_pfn_range() Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory" These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups. These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest. [1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com This patch (of 4): Checkpatch complained about a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int", and checkpatch is correct. As we would search for a fitting zone using the wrong pfn (the value truncated to 32 bits), we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly as the onlined memory does not satisfy properties of these zones. Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB. Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S.
Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Heiko Carstens Cc: Michael Ellerman Cc: Catalin Marinas Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Jiang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Michel Lespinasse Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Will Deacon Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a7fd2c3ccb77..d01b504ce06f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -339,8 +339,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -- cgit v1.2.3 From 65a2aa5f482ed0c1b5afb9e6b0b9e0b16bb8b616 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:04 -0700 Subject: mm/memory_hotplug: remove nid parameter from arch_remove_memory() The parameter is unused, let's remove it. Link: https://lkml.kernel.org/r/20210712124052.26491-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas Acked-by: Michael Ellerman [powerpc] Acked-by: Heiko Carstens [s390] Reviewed-by: Pankaj Gupta Reviewed-by: Oscar Salvador Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Baoquan He Cc: Laurent Dufour Cc: Sergei Trofimovich Cc: Kefeng Wang Cc: Michel Lespinasse Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: Thiago Jung Bauermann Cc: Joe Perches Cc: Pierre Morel Cc: Jia He Cc: Anton Blanchard Cc: Dan Williams Cc: Dave Jiang Cc: Jason Wang Cc: Len Brown Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Nathan Lynch Cc: Pankaj Gupta Cc: "Rafael J. Wysocki" Cc: "Rafael J. 
Wysocki" Cc: Scott Cheloha Cc: Vishal Verma Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d01b504ce06f..010a192298b5 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -130,8 +130,7 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -- cgit v1.2.3 From e1c158e4956612e7bada4c03dfb99210af4d6cde Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:09 -0700 Subject: mm/memory_hotplug: remove nid parameter from remove_memory() and friends There is only a single user remaining. We can simply lookup the nid only used for node offlining purposes when walking our memory blocks. We don't expect to remove multi-nid ranges; and if we'd ever do, we most probably don't care about removing multi-nid ranges that actually result in empty nodes. If ever required, we can detect the "multi-nid" scenario and simply try offlining all online nodes. Link: https://lkml.kernel.org/r/20210712124052.26491-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael Ellerman (powerpc) Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Nathan Lynch Cc: Laurent Dufour Cc: "Aneesh Kumar K.V" Cc: Scott Cheloha Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010a192298b5..068e3dcf4690 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -292,9 +292,9 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern int remove_memory(int nid, u64 start, u64 size); -extern void __remove_memory(int nid, u64 start, u64 size); -extern int offline_and_remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern void __remove_memory(u64 start, u64 size); +extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} @@ -304,12 +304,12 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline int remove_memory(int nid, u64 start, u64 size) +static inline int remove_memory(u64 start, u64 size) { return -EBUSY; } -static inline void __remove_memory(int nid, u64 start, u64 size) {} +static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void set_zone_contiguous(struct zone *zone); -- cgit v1.2.3 From 4b0970024408afb17886e0c76e9761c4264db2a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:19 -0700 Subject: mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. 
As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is required for fast memory hotplug for drivers that add individual memory blocks, like virtio-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the configuration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy.
II. Approach
This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion). For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM.
III. Target Usage
The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enables auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online")
IV. Example
For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout:
Memory block 0-15:    DMA32   (early)
Memory block 32-47:   Normal  (early)
Memory block 48-79:   Movable (DIMM 0)
Memory block 80-111:  Movable (DIMM 1)
Memory block 112-143: Movable (DIMM 2)
Memory block 144-175: Normal  (DIMM 3)
Memory block 176-207: Normal  (DIMM 4)
... all Normal (-> hotplugged Normal memory does not allow for more Movable memory)
For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout:
Memory block 0-15:    DMA32   (early)
Memory block 32-47:   Normal  (early)
Memory block 48-143:  Movable (virtio-mem, first 12 GiB)
Memory block 144:     Normal  (virtio-mem, next 128 MiB)
Memory block 145-147: Movable (virtio-mem, next 384 MiB)
Memory block 148:     Normal  (virtio-mem, next 128 MiB)
Memory block 149-151: Movable (virtio-mem, next 384 MiB)
... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device)
Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps.
V. Doc Update
I'll update the memory-hotplug.rst documentation, once the overhaul [1] is upstream. Until then, details can be found in patch #2.
VI. Future Work
1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with an extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really required. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar to adjust_managed_page_count(), and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J.
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf4690..39b04e99a30e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ee3a86830519..1c0e3bf42521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. + * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif -- cgit v1.2.3 From 028fc57a1c361116e3bcebfeba4ca87878baaf4f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:26 -0700 Subject: drivers/base/memory: introduce "memory groups" to logically group memory blocks In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. 
We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Link: https://lkml.kernel.org/r/20210806124715.17090-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 46 +++++++++++++++++++++++++++++++++++++++++- include/linux/memory_hotplug.h | 5 +++++ 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d505c12c5c77 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +/** + * struct memory_group - a logical group of memory blocks + * @nid: The node id for all memory blocks inside the memory group. + * @blocks: List of all memory blocks belonging to this memory group. + * @is_dynamic: The memory group type: static vs. dynamic + * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum + * number of pages we'll have in this static memory group. + * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages + * in which memory is added/removed in this dynamic memory group. + * This granularity defines the alignment of a unit in physical + * address space; it has to be at least as big as a single + * memory block. + * + * A memory group logically groups memory blocks; each memory block + * belongs to at most one memory group. A memory group corresponds to + * a memory device, such as a DIMM or a NUMA node, which spans multiple + * memory blocks and might even span multiple non-contiguous physical memory + * ranges. + * + * Modification of members after registration is serialized by memory + * hot(un)plug code. + */ +struct memory_group { + int nid; + struct list_head memory_blocks; + bool is_dynamic; + union { + struct { + unsigned long max_pages; + } s; + struct { + unsigned long unit_pages; + } d; + }; +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -34,6 +70,8 @@ struct memory_block { * lay at the beginning of the memory block. 
*/ unsigned long nr_vmemmap_pages; + struct memory_group *group; /* group (if any) for this block */ + struct list_head group_next; /* next block inside memory group */ }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -86,7 +124,8 @@ static inline int memory_notify(unsigned long val, void *v) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages); + unsigned long vmemmap_pages, + struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); @@ -96,6 +135,11 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) -- cgit v1.2.3 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:30 -0700 Subject: mm/memory_hotplug: track present pages in memory groups Let's track all present pages in each memory group. Especially, track memory present in ZONE_MOVABLE and memory present in one of the kernel zones (which really only is ZONE_NORMAL right now as memory groups only apply to hotplugged memory) separately within a memory group, to prepare for making smart auto-online decisions for individual memory blocks within a memory group based on group statistics. Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 6 ++++++ include/linux/memory_hotplug.h | 13 +++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index d505c12c5c77..6ffdc1db385f 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,6 +27,10 @@ * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. * @blocks: List of all memory blocks belonging to this memory group. + * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this + * memory group. + * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this + * memory group. * @is_dynamic: The memory group type: static vs. dynamic * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum number of pages we'll have in this static memory group.
@@ -48,6 +52,8 @@ struct memory_group { int nid; struct list_head memory_blocks; + unsigned long present_kernel_pages; + unsigned long present_movable_pages; bool is_dynamic; union { struct { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5d341978b4bc..cf3f423c8a74 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -12,6 +12,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct memory_group; struct resource; struct vmem_altmap; @@ -100,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct page *page, long nr_pages); +extern void adjust_present_page_count(struct page *page, + struct memory_group *group, + long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, struct memory_group *group); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -296,7 +299,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -304,7 +308,8 @@ extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { return -EINVAL; } -- cgit v1.2.3 From 445fcf7c721450dd1d4ec6c217b3c6a932602a44 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:45 -0700 Subject: mm/memory_hotplug: memory group aware "auto-movable" online policy Use memory groups to improve our "auto-movable" onlining policy: 1. For static memory groups (e.g., a DIMM), online a memory block MOVABLE only if all other memory blocks in the group are either MOVABLE or could be onlined MOVABLE. A DIMM will either be MOVABLE or not, not a mixture. 2. For dynamic memory groups (e.g., a virtio-mem device), online a memory block MOVABLE only if all other memory blocks inside the current unit are either MOVABLE or could be onlined MOVABLE. For a virtio-mem device with a device block size of 512 MiB, all 128 MiB memory blocks within a 512 MiB unit will either be MOVABLE or not, not a mixture. We have to pass the memory group to zone_for_pfn_range() to take the memory group into account. Note: for now, there seems to be no compelling reason to make this behavior configurable.
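To make rule 1 above concrete, here is a minimal C sketch of the static-group check; it is illustrative only, and the iterator for_each_group_memory_block() as well as the predicate block_is_or_could_be_movable() are invented names, not the kernel's actual implementation:

/*
 * Illustrative sketch only -- not the kernel's implementation.
 * for_each_group_memory_block() and block_is_or_could_be_movable()
 * are made-up helpers for this example.
 */
static bool group_allows_movable(struct memory_group *group,
				 struct memory_block *new_block)
{
	struct memory_block *mem;

	/* A static group (e.g., a DIMM) must be all MOVABLE or not at all. */
	for_each_group_memory_block(mem, group) {
		if (mem != new_block && !block_is_or_could_be_movable(mem))
			return false;
	}
	return true;
}

For dynamic groups, the same check would simply be restricted to the memory blocks of the unit currently being onlined.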
Link: https://lkml.kernel.org/r/20210806124715.17090-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cf3f423c8a74..e5a867c950b2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -349,7 +349,8 @@ extern void sparse_remove_section(struct mem_section *ms, extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages); + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -- cgit v1.2.3 From 3fcebf90209a7f52d384ad7701425aa91be309ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:48 -0700 Subject: mm/memory_hotplug: improved dynamic memory group aware "auto-movable" online policy Currently, the "auto-movable" online policy does not allow for hotplugged KERNEL (ZONE_NORMAL) memory to increase the amount of MOVABLE memory we can have, primarily, because there is no coordination across memory devices and we don't want to create zone imbalances accidentally when unplugging memory. However, within a single memory device it's different. Let's allow for KERNEL memory within a dynamic memory group to allow for more MOVABLE within the same memory group. The only thing we have to take care of is that the managing driver avoids zone imbalances by unplugging MOVABLE memory first, otherwise there can be corner cases where unplug of memory could result in (accidental) zone imbalances. virtio-mem is the only user of dynamic memory groups and recently added support for prioritizing unplug of ZONE_MOVABLE over ZONE_NORMAL, so we don't need a new toggle to enable it for dynamic memory groups. We limit this handling to dynamic memory groups, because: * We want to keep the runtime overhead for collecting stats when onlining a single memory block small. We tend to have only a handful of dynamic memory groups, but we can have quite a few static memory groups (e.g., 256 DIMMs). * It doesn't make too much sense for static memory groups, as we try onlining all applicable memory blocks either completely to ZONE_MOVABLE or not. In ordinary operation, we won't have a mixture of zones within a static memory group. When adding memory to a dynamic memory group, we'll first online memory to ZONE_MOVABLE as long as early KERNEL memory allows for it. Then, we'll online the next unit(s) to ZONE_NORMAL, until we can online the next unit(s) to ZONE_MOVABLE. For a simple virtio-mem device with a MOVABLE:KERNEL ratio of 3:1, it will result in a layout like:
[M][M][M][M][M][M][M][M][N][M][M][M][N][M][M][M]...
^ movable memory due to early kernel memory
^ allows for more movable memory ...
^-----^ ... here
^ allows for more movable memory ...
^-----^ ... here
While the created layout is sub-optimal when it comes to contiguous zones, it gives us the maximum flexibility when dynamically growing/shrinking a device; we can grow small VMs really big in small steps, and still shrink reliably to e.g., 1/4 of the maximum VM size in this example, removing full memory blocks along with meta data more reliably. Mark dynamic memory groups in the xarray such that we can efficiently iterate over them when collecting stats. In usual setups, we have one virtio-mem device per NUMA node, and usually only a small number of NUMA nodes. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 6ffdc1db385f..cbcc43ad2b97 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -146,6 +146,9 @@ extern int memory_group_register_static(int nid, unsigned long max_pages); extern int memory_group_register_dynamic(int nid, unsigned long unit_pages); extern int memory_group_unregister(int mgid); struct memory_group *memory_group_find_by_id(int mgid); +typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From fe3df441ef885a75a3eff5e151ead1a92266d222 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:55:55 -0700 Subject: mm: remove redundant compound_head() calling There is a READ_ONCE() in the macro of compound_head(), which prevents the compiler from optimizing the code when it is called more than once in a function. Remove the redundant calls to compound_head() from page_to_index() and page_add_file_rmap() for better code generation. Link: https://lkml.kernel.org/r/20210811101431.83940-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Howells Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Kirill A.
Shutemov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..904e57db3a7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -521,18 +521,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_index(struct page *page) { - pgoff_t pgoff; + struct page *head; if (likely(!PageTransTail(page))) return page->index; + head = compound_head(page); /* * We don't initialize ->index for tail pages: calculate based on * head page */ - pgoff = compound_head(page)->index; - pgoff += page - compound_head(page); - return pgoff; + return head->index + page - head; } extern pgoff_t hugetlb_basepage_index(struct page *page); -- cgit v1.2.3 From 82a70ce0426dd7c4099516175019dccbd18cebf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 19:56:01 -0700 Subject: mm: move ioremap_page_range to vmalloc.c Patch series "small ioremap cleanups". The first patch moves a little code around the vmalloc/ioremap boundary following a bigger move by Nick earlier. The second enforces non-executable mapping on ioremap just like we do for vmap. No driver currently uses executable mappings anyway, which is as it should be. This patch (of 2): This keeps it together with the implementation, and allows removing the vmap_range wrapper. Link: https://lkml.kernel.org/r/20210824091259.1324527-1-hch@lst.de Link: https://lkml.kernel.org/r/20210824091259.1324527-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Nicholas Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2644425b6dce..671d402c3778 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -225,9 +225,6 @@ static inline bool is_vm_area_hugepages(const void *addr) } #ifdef CONFIG_MMU -int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { -- cgit v1.2.3 From 513861202d1259e35934e206b79cd54f523d79b5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Sep 2021 19:56:09 -0700 Subject: highmem: don't disable preemption on RT in kmap_atomic() kmap_atomic() disables preemption and pagefaults for historical reasons. The conversion to kmap_local(), which only disables migration, cannot be done wholesale because quite a few call sites need to be updated to accommodate the changed semantics. On PREEMPT_RT enabled kernels the kmap_atomic() semantics are problematic due to the implicit disabling of preemption, which makes it impossible to acquire 'sleeping' spinlocks within the kmap atomic sections. PREEMPT_RT has replaced the preempt_disable() with a migrate_disable() for more than a decade. It could be argued that this is a justification to do this unconditionally, but PREEMPT_RT covers only a limited number of architectures and it disables some functionality which limits the coverage further. Limit the replacement to PREEMPT_RT for now.
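As a made-up illustration of the problem (not taken from the patch; zero_page_locked() and some_lock are hypothetical), the old kmap_atomic() semantics disable preemption across the section, so acquiring a spinlock_t, which is a sleeping rtmutex on PREEMPT_RT, is invalid there:

#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/string.h>

static DEFINE_SPINLOCK(some_lock);	/* hypothetical lock */

static void zero_page_locked(struct page *page)
{
	void *vaddr = kmap_atomic(page);	/* historically implies preempt_disable() */

	spin_lock(&some_lock);	/* sleeps on PREEMPT_RT: a bug with preemption disabled */
	memset(vaddr, 0, PAGE_SIZE);
	spin_unlock(&some_lock);
	kunmap_atomic(vaddr);
}

With migrate_disable() instead, the task stays on its CPU, which the CPU-local kmap bookkeeping requires, but remains preemptible, so taking the sleeping lock becomes legal.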
Link: https://lkml.kernel.org/r/20210810091116.pocdmaatdcogvdso@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Vlastimil Babka Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem-internal.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 7902c7d8b55f..4aa1031d3e4c 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr) static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_page_prot(page, prot); } @@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page) static inline void *kmap_atomic_pfn(unsigned long pfn) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } @@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr) { kunmap_local_indexed(addr); pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } unsigned int __nr_free_highpages(void); @@ -179,7 +190,10 @@ static inline void __kunmap_local(void *addr) static inline void *kmap_atomic(struct page *page) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -200,7 +214,10 @@ static inline void __kunmap_atomic(void *addr) kunmap_flush_on_unmap(addr); #endif pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } -- cgit v1.2.3 From 41c961b9013ee9b6d0491f6926df546e37964b1f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:56:15 -0700 Subject: mm: introduce PAGEFLAGS_MASK to replace ((1UL << NR_PAGEFLAGS) - 1) Instead of hard-coding ((1UL << NR_PAGEFLAGS) - 1) everywhere, introduce PAGEFLAGS_MASK to make the code for getting the page flags clearer. Link: https://lkml.kernel.org/r/20210819150712.59948-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..6b8d66965145 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -178,6 +178,8 @@ enum pageflags { PG_reported = PG_uptodate, }; +#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) + #ifndef __GENERATING_BOUNDS_H static inline unsigned long _compound_head(const struct page *page) @@ -859,7 +861,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page.
*/ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (PAGEFLAGS_MASK & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) -- cgit v1.2.3 From 2224d8485492e499ca2e5d25407f8502cc06f149 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:28 -0700 Subject: mm: introduce Data Access MONitor (DAMON) Patch series "Introduce Data Access MONitor (DAMON)", v34.
Introduction
============
DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it - accurate (The monitored information is useful for DRAM level memory management. It might not be appropriate for cache-level accuracy, though.), - light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and - scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.). Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to be aware of real data access patterns. Experimental access pattern aware memory management optimization works that incur high instrumentation overhead will be able to have another try. Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems. DAMON is also merged in two public Amazon Linux kernel trees that are based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. Also, the basic test suite for DAMON is available under GPLv2[3]. [1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests
Long-term Plan
--------------
DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project.
---------------------------------------------------------------------------
Primitives:     PTE Accessed bit, PG_idle, rmap, (Intel CMT), ...
Framework:      DAMON
Features:       DAMOS, virtual addr, physical addr, ...
Applications:   DAMON-debugfs, (DARC), ...
^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Raw Interface:  debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ...
vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
Library:        (libdamon), ...
Tools:          DAMO, (perf), ...
---------------------------------------------------------------------------
The components in parentheses or marked as '...' are not implemented yet but in the future plan.
IOW, those are the TODO tasks of the DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/
Evaluations
===========
We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel with the v24 DAMON patchset applied. DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%. DAMON is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of resident sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concept. Please refer to the official document[1] or the "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval.html
Real-world User Story
=====================
In summary, DAMON has been used on production systems and proved its usefulness.
DAMON as a profiler
-------------------
We analyzed characteristics of large scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find the interesting things below. There were obviously different access patterns under the idle workload and the active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high frequency. DAMON found a 7GB memory region showing obviously high access frequency under the active workload. We believe this is the performance-effective working set and needs to be protected. There was a 4KB memory region showing the highest access frequency under not only active but also idle workloads. We think this must be a hot code section or something similar that should never be paged out. For this analysis, DAMON used only 0.3-1% of a single CPU's time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. This is only a small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding the hottest 4KB region in 70GB of memory) with such light overhead.
DAMON as a system optimization tool
-----------------------------------
We also found the potential performance problems below on the systems and made DAMON-based solutions. The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code sections are not mistakenly evicted. Using direct IO or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use a DAMON-based operation scheme[1].
By using it, we can ask DAMON to track the access frequency of each region and make a 'process_madvise(MADV_WILLNEED)'[2] call for regions having a specific size and access frequency over a time interval. We also found the system has a high number of TLB misses. We tried the 'always' THP enabled policy and it greatly reduced TLB misses, but page reclamation also became more frequent due to the memory bloat caused by THP internal fragmentation. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having >=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to regions having <2MB size and low access frequency. We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers were satisfied with the analysis results and promised to try the optimization guides. [1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/
Comparison with Idle Page Tracking
==================================
Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by 1. finding the PFN of each page for workloads in interest, 2. setting all the pages as idle by doing writes to the bitmap file, 3. waiting until the workload accesses its working set, and 4. reading the idleness of the pages again and counting the pages that became not idle. NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily be exposed to the user space. Hence, this section only assumes such user space use of DAMON.
For what use cases Idle Page Tracking would be better?
------------------------------------------------------
1. Flexible use cases other than hotness monitoring. Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For that reason, there could be some use cases where using Idle Page Tracking is simpler. 2. Physical memory monitoring. Idle Page Tracking receives a PFN range as input, so it natively supports physical memory monitoring. DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, in theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exist. However, the default primitives introduced by this patchset support only virtual address spaces. Therefore, for physical memory monitoring, you should implement your own primitives and use them, or simply use Idle Page Tracking. Nonetheless, RFC patchsets[1] for the physical memory address space primitives are already available. It also supports user memory, the same as Idle Page Tracking. [1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/
For what use cases DAMON is better?
-----------------------------------
1. Hotness Monitoring. Idle Page Tracking lets users know only if a page frame is accessed or not. For hotness checks, the user should write more code and use more memory. DAMON does that by itself. 2.
Low Monitoring Overhead DAMON receives a user's monitoring request in one step and then provides the results. So, roughly speaking, DAMON requires only O(1) user/kernel context switches. In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be non-negligible. Moreover, DAMON is built to handle the monitoring overhead. Because the core mechanism is purely logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still be more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case. 3. Page granularity working set size detection. Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. The size of a single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used. All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used. There are more details to consider, but roughly speaking, this is true in most cases. However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead but fewer user/kernel context switches. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon. Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though. [1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master 4. More future use cases While Idle Page Tracking has tight coupling with base primitives (PG_Idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could change a lot in the future, using DAMON could be better.
Can I use both Idle Page Tracking and DAMON?
--------------------------------------------
Yes, though using them concurrently for overlapping memory regions could result in interference with each other. Nevertheless, such a use case would be rare or make no sense at all. Even in that case, the noise would not be really significant. So, you can choose whatever you want depending on the characteristics of your use cases.
More Information
================
We prepared a showcase web site[1] where you can get more information.
There are

- the official documentation[2],
- the heatmap-format dynamic access patterns of various realistic workloads for the heap area[3], the mmap()-ed area[4], and the stack area[5],
- the dynamic working set size distribution[6] and chronological working set size changes[7], and
- the latest performance test results[8].

[1] https://damonitor.github.io/_index
[2] https://damonitor.github.io/doc/html/latest-damon
[3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html
[4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
[5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html
[6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
[7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
[8] https://damonitor.github.io/test/result/perf/latest/html/index.html

Baseline and Complete Git Trees
===============================

The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree:

    $ git clone git://github.com/sjp38/linux -b damon/patches/v34

A web view is also available:
https://github.com/sjp38/linux/releases/tag/damon/patches/v34

Development Trees
-----------------

There are a couple of trees for the entire DAMON patchset series and features for future release.

- For the latest release: https://github.com/sjp38/linux/tree/damon/master
- For the next release: https://github.com/sjp38/linux/tree/damon/next

Long-term Support Trees
-----------------------

For people who want to test DAMON but are using LTS kernels, there is another couple of trees, based on the two latest LTS kernels respectively and containing the 'damon/master' backports.

- For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y
- For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y

Amazon Linux Kernel Trees
-------------------------

DAMON is also merged in two public Amazon Linux kernel trees that are based on v5.4.y[1] and v5.10.y[2].

[1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon
[2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon

Git Tree for Diff of Patches
============================

For easy review of the diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series:

https://github.com/sjp38/damon-patches

You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example:

    $ git clone https://github.com/sjp38/damon-patches && cd damon-patches
    $ diff -u damon/v33 damon/v34

Sequence Of Patches
===================

The first three patches implement the core logic of DAMON. The 1st patch introduces basic sampling-based hotness monitoring for arbitrary types of targets. The following two patches implement the core mechanisms for the control of overhead and accuracy, namely regions-based sampling (patch 2) and adaptive regions adjustment (patch 3).

Now the essential parts of DAMON are complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address space monitoring. The 4th patch makes 'PG_idle' usable by DAMON, and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag.

Now DAMON just works for virtual address space monitoring via the kernel space API.
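For illustration, a kernel space user of the completed series could drive the monitoring roughly as below (a minimal sketch using only functions that the patches in this series declare; error handling is omitted and the result-reading callback wiring is not shown)::

    #include <linux/damon.h>

    static struct damon_ctx *ctx;

    static int example_start_monitoring(unsigned long pid)
    {
        ctx = damon_new_ctx();
        if (!ctx)
            return -ENOMEM;
        /* use the reference virtual address space primitives */
        damon_va_set_primitives(ctx);
        /* 5ms sampling, 100ms aggregation, 1s update, 10-1000 regions */
        damon_set_attrs(ctx, 5000, 100000, 1000000, 10, 1000);
        /* one target process, identified by its pid */
        damon_set_targets(ctx, &pid, 1);
        return damon_start(&ctx, 1);
    }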
To let user space use DAMON, the following four patches add interfaces for it. The 6th patch adds a tracepoint for monitoring results. The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes the DAMON interface to user space via the debugfs interface. The 8th patch further exports the pid of the monitoring thread (kdamond) to user space for easier CPU usage accounting, and the 9th patch makes the debugfs interface support multiple contexts.

Three patches for maintainability follow. The 10th patch adds documentation for both the user space and the kernel space. The 11th patch provides unit tests (based on kunit), while the 12th patch adds user space tests (based on kselftest). Finally, the last patch (13th) updates the MAINTAINERS file.

This patch (of 13):

DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it

- accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; it might be inappropriate for CPU cache levels, though),
- light-weight (the monitoring overhead is normally low enough to be applied online), and
- scalable (the upper-bound of the overhead is in a constant range regardless of the size of target workloads).

Using this framework, hence, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that previously incurred high access monitoring overhead could be implemented on top of this again.

Due to its simple and flexible interface, providing a user space interface would also be easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimization of their workloads and systems.

===

Nevertheless, this commit is defining and implementing only the basic access check part, without the overhead-accuracy handling core logic. The basic access check works as below.

The output of DAMON says which memory regions are how frequently accessed for a given duration. The resolution of the access frequency is controlled by setting the ``sampling interval`` and the ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, it counts the number of accesses to each region. After each ``aggregation interval`` passes, DAMON calls callback functions that users previously registered, so that users can read the aggregated results, and then clears the results. This can be described by the simple pseudo-code below::

    init()
    while monitoring_on:
        for page in monitoring_target:
            if accessed(page):
                nr_accesses[page] += 1
        if time() % aggregation_interval == 0:
            for callback in user_registered_callbacks:
                callback(monitoring_target, nr_accesses)
            for page in monitoring_target:
                nr_accesses[page] = 0
        if time() % update_interval == 0:
            update()
        sleep(sampling interval)

The target regions are constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could dynamically change (e.g., by mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows.

The basic monitoring primitives for the actual access check and the dynamic target regions construction aren't in the core part of DAMON.
Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use the current version of DAMON without some additional work. The following commits will implement the core mechanisms for the overhead-accuracy control and default primitives implementations. Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Jonathan Cameron Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Joe Perches Cc: Mel Gorman Cc: Maximilian Heyne Cc: Minchan Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: David Rientjes Cc: Steven Rostedt (VMware) Cc: Shuah Khan Cc: Vlastimil Babka Cc: Vladimir Davydov Cc: Brendan Higgins Cc: Markus Boehme Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 include/linux/damon.h (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 000000000000..2f652602b1ea --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses.
+ * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. + */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. 
Reads and writes to @kdamond and @kdamond_stop from + outside of the monitoring thread must be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. + */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ -- cgit v1.2.3 From f23b8eee1871a6db5c37f90831147de5426c40b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:32 -0700 Subject: mm/damon/core: implement region-based sampling To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, 1. the 'prepare_access_checks' primitive picks one page in each region, 2. DAMON waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so. Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON use not only time-based sampling but also space-based sampling. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. The next commit will address this problem.
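In the pseudo-code style of the cover letter, the work per ``sampling interval`` becomes roughly the following (an illustration only; the field names are those of the damon_region structure added by this patch)::

    for t in targets:
        for r in t.regions:
            # one page represents the whole region
            r.sampling_addr = a random page in [r.ar.start, r.ar.end)
            clear the Accessed state of r.sampling_addr
    sleep(sampling interval)
    for t in targets:
        for r in t.regions:
            if r.sampling_addr was accessed meanwhile:
                r.nr_accesses += 1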
Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1ea..67db309ad61b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include #include +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx; /** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - void *target; + struct list_head region_targets; }; +#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON +struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, -- cgit v1.2.3 From b9a6ac4e4ede4172d165c133398b93e3233b0ba7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:36 -0700 Subject: mm/damon: adaptively adjust regions Even if the initial monitoring target regions are somehow well constructed to fulfill the assumption (pages in the same region have similar access frequencies), the data access pattern can change dynamically. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set.
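In the same pseudo-code style, each ``aggregation interval`` therefore roughly does the following (an illustration only; the exact merge threshold and split heuristics are left to the implementation)::

    # merge adjacent regions that show similar access frequencies
    for each adjacent pair (r, s) of regions in each target:
        if abs(r.nr_accesses - s.nr_accesses) <= merge_threshold:
            merge r and s into a single region
    # report the aggregated results, then reset the counters
    for callback in user_registered_callbacks:
        callback(targets)
    for each region r:
        r.nr_accesses = 0
    # split again to regain resolution, but respect the user limit
    if splitting keeps the total region count <= max_nr_regions:
        split each region into two or three smaller regions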
Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61b..ce2a84b26cd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include #include +/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. * @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. + * @adaptive_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; }; #define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list) #define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list) #define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From 1c676e0d9b1a59b98885b24a0e16a81fe4cc8301 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:40 -0700 Subject: mm/idle_page_tracking: make PG_idle reusable PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic, to work concurrently without interfering with each other. That is, when one of them needs to clear the Accessed bit, it sets PG_young to preserve the previous state of the bit. And when one needs to read the bit and finds it cleared, it additionally reads PG_young to know whether the other user cleared the bit in the meantime. For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags. In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking. In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it.
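The sharing protocol can be sketched as below (a simplified illustration built on the page_idle.h helpers; this is not the exact DAMON code)::

    #include <linux/page_idle.h>
    #include <linux/pgtable.h>

    /* before sampling: clear the Accessed bit, preserving its state */
    static void sample_mkold(struct vm_area_struct *vma, unsigned long addr,
                             pte_t *pte, struct page *page)
    {
        if (ptep_test_and_clear_young(vma, addr, pte))
            set_page_young(page);  /* keep the info for the other user */
        set_page_idle(page);
    }

    /* after sampling: consider both the h/w bit and PG_young */
    static bool sample_young(pte_t *pte, struct page *page)
    {
        return pte_young(*pte) || !page_is_idle(page) ||
               test_and_clear_page_young(page);
    }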
[sjpark@amazon.de: set PAGE_EXTENSION for non-64BIT] Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com [akpm@linux-foundation.org: tweak Kconfig text] [sjpark@amazon.de: hide PAGE_IDLE_FLAG from users] Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b8d66965145..0a51dd1bb6b1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -131,7 +131,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -441,7 +441,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index aff81ba31bd8..fabb2e1e087f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdce..d8a6aecf99cb 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ -- cgit v1.2.3 From 3f49584b262cf8f42b25f4c1ad9f5bfd3bdc1bca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:44 -0700 Subject: mm/damon: implement primitives for the virtual memory address spaces This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of 
DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON.

The low level primitives for the fundamental access monitoring are defined in two parts: 1. identification of the monitoring target address range for the address space, and 2. access check of a specific address range in the target space.

The reference implementation for the virtual address space works as described below.

PTE Accessed-bit Based Access Check
-----------------------------------

The implementation uses the PTE Accessed-bit for basic access checks. That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses the ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle Page Tracking does.

VMA-based Target Address Range Construction
-------------------------------------------

Only small parts of the super-huge virtual address space of a process are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required; it could even incur a high overhead in some cases. That said, excessively huge unmapped areas inside the monitoring target should be removed so that the adaptive mechanism does not waste its effort on them.

For this reason, this implementation converts the complex mappings into three distinct regions that cover every mapped area of the address space. The two gaps between the three regions are the two biggest unmapped areas in the given address space. In most cases, the two biggest unmapped areas are the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack. Because these gaps are exceptionally huge in usual address spaces, excluding them is sufficient to make a reasonable trade-off.
Below shows this in detail::

    <heap>
    <BIG UNMAPPED REGION 1>
    <uppermost mmap()-ed region>
    (small mmap()-ed regions and munmap()-ed regions)
    <lowermost mmap()-ed region>
    <BIG UNMAPPED REGION 2>
    <stack>

[akpm@linux-foundation.org: mm/damon/vaddr.c needs highmem.h for kunmap_atomic()] [sjpark@amazon.de: remove unnecessary PAGE_EXTENSION setup] Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com [sjpark@amazon.de: safely walk page table] Link: https://lkml.kernel.org/r/20210831161800.29419-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd7..edb350e52b93 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ +#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void *t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ -- cgit v1.2.3 From 4bc05954d0076655cfaf6f0135585bdc20cd6b11 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:53 -0700 Subject: mm/damon: implement a debugfs-based user space interface

DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only a kernel space API. That said, letting user space control DAMON could provide some benefits. For example, it would allow user space to analyze their specific workloads and make their own special optimizations.

For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON API and exports it to user space via debugfs. 'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on``, under its debugfs directory, ``<debugfs>/damon/``.

Attributes
----------

Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and the min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. For example, the commands below set those values to 5 ms, 100 ms, 1,000 ms, 10, and 1000, then check them again::

    # cd <debugfs>/damon
    # echo 5000 100000 1000000 10 1000 > attrs
    # cat attrs
    5000 100000 1000000 10 1000

Target IDs
----------

Some types of address spaces support multiple monitoring targets. For example, virtual memory address space monitoring can have multiple processes as the monitoring targets.
Users can set the targets by writing the relevant id values of the targets to, and get the ids of the current targets by reading from, the ``target_ids`` file. In case of virtual address space monitoring, the values should be the pids of the monitoring target processes. For example, the commands below set the processes having pids 42 and 4242 as the monitoring targets, then check them again::

    # cd <debugfs>/damon
    # echo 42 4242 > target_ids
    # cat target_ids
    42 4242

Note that setting the target ids doesn't start the monitoring.

Turning On/Off
--------------

Setting the files as described above has no effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops the monitoring. DAMON also stops if every target is invalidated (in case of virtual memory monitoring, target processes are invalidated when terminated). The example commands below turn the monitoring on and off, then check its status::

    # cd <debugfs>/damon
    # echo on > monitor_on
    # echo off > monitor_on
    # cat monitor_on
    off

Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned.

[akpm@linux-foundation.org: remove unneeded "alloc failed" printks] [akpm@linux-foundation.org: replace macro with static inline] Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b93..d68b67b8d458 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From a8a47cf5ce4bbc70a54fa4eca71d35f43dc8218a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Sep 2021 19:57:41 -0700 Subject: include/linux/once.h: fix trivia typo Not -> Note Fix the trivial typo Not -> Note in the comment to DO_ONCE().
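For context, DO_ONCE() runs its callee at most once per call site, which makes it handy for lazy one-time initialization, e.g. (an illustrative sketch)::

    #include <linux/once.h>
    #include <linux/random.h>

    static u32 hash_seed;

    static u32 my_hash(u32 x)
    {
        /* get_random_bytes() runs at most once for this call site */
        DO_ONCE(get_random_bytes, &hash_seed, sizeof(hash_seed));
        return x ^ hash_seed;
    }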
Link: https://lkml.kernel.org/r/20210722184349.76290-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index ae6f4eb41cbe..d361fb14ac3a 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -16,7 +16,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, * out the condition into a nop. DO_ONCE() guarantees type safety of * arguments! * - * Not that the following is not equivalent ... + * Note that the following is not equivalent ... * * DO_ONCE(func, arg); * DO_ONCE(func, arg); -- cgit v1.2.3 From c9221919a2d2df5741ab074dfec5bdfc6f1e043b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:44 -0700 Subject: units: change from 'L' to 'UL' Patch series "Add Hz macros", v3. There are multiple definitions of the HZ_PER_MHZ or HZ_PER_KHZ in the different drivers. Instead of duplicating this definition again and again, add one in the units.h header to be reused in all the places the redefinition occurs. At the same time, change the type of the Watts, as they cannot be negative. This patch (of 10): The users of the macros can safely be assigned an unsigned instead of a signed value, as the variables using them are themselves unsigned. Link: https://lkml.kernel.org/r/20210816114732.1834145-1-daniel.lezcano@linaro.org Link: https://lkml.kernel.org/r/20210816114732.1834145-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Cc: Andy Shevchenko Cc: Jonathan Cameron Cc: Christian Eggers Cc: Lukasz Luba Cc: MyungJoo Ham Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Peter Meerwald Cc: Zhang Rui Cc: Guenter Roeck Cc: Miquel Raynal Cc: Maxime Coquelin Cc: "Rafael J. Wysocki" Cc: Daniel Lezcano Cc: Chanwoo Choi Cc: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index dcc30a53fa93..ff51d3cfc6a0 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,9 +4,9 @@ #include -#define MILLIWATT_PER_WATT 1000L -#define MICROWATT_PER_MILLIWATT 1000L -#define MICROWATT_PER_WATT 1000000L +#define MILLIWATT_PER_WATT 1000UL +#define MICROWATT_PER_MILLIWATT 1000UL +#define MICROWATT_PER_WATT 1000000UL #define ABSOLUTE_ZERO_MILLICELSIUS -273150 -- cgit v1.2.3 From e2c77032fcbe515194107994d12cd72ddb77b022 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:48 -0700 Subject: units: add the HZ macros The macros for the unit conversion for frequency are duplicated in different places. Provide these macros in the 'units' header, so they can be reused. Link: https://lkml.kernel.org/r/20210816114732.1834145-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J.
Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index ff51d3cfc6a0..8b8dc8a84d93 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,6 +4,10 @@ #include +#define HZ_PER_KHZ 1000UL +#define KHZ_PER_MHZ 1000UL +#define HZ_PER_MHZ 1000000UL + #define MILLIWATT_PER_WATT 1000UL #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL -- cgit v1.2.3 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. [rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/user.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 2462f7d07695..00ed419dd464 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ struct user_struct { refcount_t __count; /* reference count */ #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ -- cgit v1.2.3 From 5b91a75b3312c03798f555e10569fd85211a490c Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 7 Sep 2021 20:00:38 -0700 Subject: pid: cleanup the stale comment mentioning pidmap_init(). pidmap_init() has already been replaced with pid_idr_init() in the commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API"). Cleanup the stale comment which still mentions it. Link: https://lkml.kernel.org/r/20210714120713.19825-1-itazur@amazon.com Signed-off-by: Takahiro Itazuri Cc: Kuniyuki Iwashima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/threads.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/threads.h b/include/linux/threads.h index 18d5a74bcc3d..c34173e6c5f1 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -38,7 +38,7 @@ * Define a minimum number of pids per cpu. Heuristically based * on original pid max of 32k for 32 cpus. 
Also, increase the * minimum settable value for pid_max on the running system based - * on similar defaults. See kernel/pid.c:pidmap_init() for details. + * on similar defaults. See kernel/pid.c:pid_idr_init() for details. */ #define PIDS_PER_CPU_DEFAULT 1024 #define PIDS_PER_CPU_MIN 8 -- cgit v1.2.3 From e130242dc351f1cfa2bbeb6766a1486ce936ef88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:21 -0700 Subject: mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e0598c7d1d1..3a2ac5afee30 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -395,14 +395,6 @@ struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; -#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) - -#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size); -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size); void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from); int copy_siginfo_from_user32(kernel_siginfo_t *to, @@ -976,6 +968,15 @@ static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ +#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) + +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) + +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, + unsigned long bitmap_size); +long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, + unsigned long bitmap_size); + /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures -- cgit v1.2.3 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); -- cgit v1.2.3 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 2 -- include/linux/uaccess.h | 10 ---------- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d42cebd1fb8..1c758b0e0359 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -511,8 +511,6 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, struct epoll_event; /* fortunately, this one is fixed-layout */ -extern void __user *compat_alloc_user_space(unsigned long len); - int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index c05e903cef02..ac0394087f7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -200,16 +200,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) n = _copy_to_user(to, from, n); return n; } -#ifdef CONFIG_COMPAT -static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - might_fault(); - if (access_ok(to, n) && access_ok(from, n)) - n = raw_copy_in_user(to, from, n); - return n; -} -#endif #ifndef copy_mc_to_kernel /* -- cgit v1.2.3 From b83a908498d68fafca931e1276e145b339cac5fb Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 2 Aug 2021 13:23:20 -0700 Subject: compiler_attributes.h: move __compiletime_{error|warning} Clang 14 will add support for __attribute__((__error__(""))) and __attribute__((__warning__(""))). To make use of these in __compiletime_error and __compiletime_warning (as used by BUILD_BUG and friends) for newer clang and detect/fallback for older versions of clang, move these to compiler_attributes.h and guard them with __has_attribute preprocessor guards. 
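These macros are what let BUILD_BUG()-style interfaces turn provable mistakes into build failures. A minimal usage sketch, with hypothetical helper names::

    #include <linux/string.h>
    #include <linux/types.h>

    extern void __my_copy_overflow(void)
        __compiletime_error("copy_checked: destination too small");

    static inline void copy_checked(void *dst, size_t dst_size,
                                    const void *src, size_t n)
    {
        /*
         * The call below survives dead-code elimination, and thus
         * triggers the error, only when the compiler can prove an
         * overflow at build time.
         */
        if (__builtin_constant_p(n) && n > dst_size)
            __my_copy_overflow();
        memcpy(dst, src, n);
    }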
Link: https://reviews.llvm.org/D106030
Link: https://bugs.llvm.org/show_bug.cgi?id=16428
Link: https://github.com/ClangBuiltLinux/linux/issues/1173
Signed-off-by: Nick Desaulniers
Reviewed-by: Nathan Chancellor
Reviewed-by: Kees Cook
[Reworded, landed in Clang 14]
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-gcc.h        |  3 ---
 include/linux/compiler_attributes.h | 24 ++++++++++++++++++++++++
 include/linux/compiler_types.h      |  6 ------
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5d97ef738a57..61c1479688db 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -43,9 +43,6 @@
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
-#define __compiletime_warning(message) __attribute__((__warning__(message)))
-#define __compiletime_error(message) __attribute__((__error__(message)))
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 7b1fa5c30169..f4df9e5a8c76 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -30,6 +30,7 @@
 # define __GCC4_has_attribute___assume_aligned__      (__GNUC_MINOR__ >= 9)
 # define __GCC4_has_attribute___copy__                0
 # define __GCC4_has_attribute___designated_init__     0
+# define __GCC4_has_attribute___error__               1
 # define __GCC4_has_attribute___externally_visible__  1
 # define __GCC4_has_attribute___no_caller_saved_registers__ 0
 # define __GCC4_has_attribute___noclone__             1
@@ -38,6 +39,7 @@
 # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9)
 # define __GCC4_has_attribute___no_sanitize_coverage__  0
 # define __GCC4_has_attribute___fallthrough__           0
+# define __GCC4_has_attribute___warning__               1
 #endif
 
 /*
@@ -137,6 +139,17 @@
 # define __designated_init
 #endif
 
+/*
+ * Optional: only supported since clang >= 14.0
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-error-function-attribute
+ */
+#if __has_attribute(__error__)
+# define __compiletime_error(msg) __attribute__((__error__(msg)))
+#else
+# define __compiletime_error(msg)
+#endif
+
 /*
  * Optional: not supported by clang
  *
@@ -286,6 +299,17 @@
  */
 #define __must_check __attribute__((__warn_unused_result__))
 
+/*
+ * Optional: only supported since clang >= 14.0
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warning-function-attribute
+ */
+#if __has_attribute(__warning__)
+# define __compiletime_warning(msg) __attribute__((__warning__(msg)))
+#else
+# define __compiletime_warning(msg)
+#endif
+
 /*
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d29bda7f6ebd..8246d0caffa6 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -294,12 +294,6 @@ struct ftrace_likely_data {
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
 #endif
-#ifndef __compiletime_warning
-# define __compiletime_warning(message)
-#endif
-#ifndef __compiletime_error
-# define __compiletime_error(message)
-#endif
 
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix) \
--
cgit v1.2.3

From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001
From: Liu Zixian
Date: Wed, 8 Sep 2021 18:10:05 -0700
Subject: mm/hugetlb: initialize hugetlb_usage in mm_init

After fork, the child process will get incorrect (2x) hugetlb_usage.
If a process uses 5 2MB hugetlb pages in an anonymous mapping,

	HugetlbPages:	   10240 kB

and then forks, the child will show

	HugetlbPages:	   20480 kB

The amount is doubled because hugetlb_usage is first copied from the
parent and then increased again while the page tables are copied from
parent to child, leaving the child with twice its actual usage.

Fix this by adding hugetlb_count_init in mm_init.

Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com
Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status")
Signed-off-by: Liu Zixian
Reviewed-by: Naoya Horiguchi
Reviewed-by: Mike Kravetz
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f7ca1a3870ea..1faebe1cd0ed 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -858,6 +858,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
 
+static inline void hugetlb_count_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->hugetlb_usage, 0);
+}
+
 static inline void hugetlb_count_add(long l, struct mm_struct *mm)
 {
 	atomic_long_add(l, &mm->hugetlb_usage);
@@ -1042,6 +1047,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 	return &mm->page_table_lock;
 }
 
+static inline void hugetlb_count_init(struct mm_struct *mm)
+{
+}
+
 static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
 {
 }
--
cgit v1.2.3

From 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c Mon Sep 17 00:00:00 2001
From: Liam Howlett
Date: Wed, 8 Sep 2021 18:10:14 -0700
Subject: mmap_lock: change trace and locking order

Print to the trace log before releasing the lock to avoid racing with
other trace log printers of the same lock type.

Link: https://lkml.kernel.org/r/20210903022041.1843024-1-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett
Suggested-by: Steven Rostedt (VMware)
Reviewed-by: Matthew Wilcox (Oracle)
Cc: Michel Lespinasse
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmap_lock.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0540f0156f58..b179f1e3541a 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -101,14 +101,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
-	up_write(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, true);
+	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
-	downgrade_write(&mm->mmap_lock);
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	downgrade_write(&mm->mmap_lock);
 }
 
 static inline void mmap_read_lock(struct mm_struct *mm)
@@ -140,8 +140,8 @@ static inline bool mmap_read_trylock(struct mm_struct *mm)
 
 static inline void mmap_read_unlock(struct mm_struct *mm)
 {
-	up_read(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, false);
+	up_read(&mm->mmap_lock);
 }
 
 static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
@@ -155,8 +155,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
 
 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 {
-	up_read_non_owner(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, false);
+	up_read_non_owner(&mm->mmap_lock);
 }
 
 static inline void mmap_assert_locked(struct mm_struct *mm)
--
cgit v1.2.3

From 5dfe50b05588010f347cb2f436434bf22b7a84ed Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 9 Sep 2021 22:36:38 +0900
Subject: bootconfig: Rename xbc_node_find_child() to xbc_node_find_subkey()

Rename xbc_node_find_child() to xbc_node_find_subkey() to clarify that
the function returns a key node, never a value node. Since there are
xbc_node_for_each_child() (which loops over all child nodes) and
xbc_node_for_each_subkey() (which loops over subkey nodes only), this
naming distinction is necessary to avoid confusing users.

Link: https://lkml.kernel.org/r/163119459826.161018.11200274779483115300.stgit@devnote2
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/bootconfig.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index abe089c27529..537e1b991f11 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -110,7 +110,7 @@ static inline __init bool xbc_node_is_leaf(struct xbc_node *node)
 }
 
 /* Tree-based key-value access APIs */
-struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent,
+struct xbc_node * __init xbc_node_find_subkey(struct xbc_node *parent,
 					     const char *key);
 
 const char * __init xbc_node_find_value(struct xbc_node *parent,
@@ -148,7 +148,7 @@ xbc_find_value(const char *key, struct xbc_node **vnode)
  */
 static inline struct xbc_node * __init xbc_find_node(const char *key)
 {
-	return xbc_node_find_child(NULL, key);
+	return xbc_node_find_subkey(NULL, key);
 }
 
 /**
--
cgit v1.2.3

From 8c854303ce0e38e5bbedd725ff39da7e235865d8 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 3 Aug 2021 16:16:21 +0200
Subject: cpu/hotplug: Remove deprecated CPU-hotplug functions.

No users in tree use the deprecated CPU-hotplug functions anymore.
Remove them.
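As the removed wrappers below show, they were plain aliases for the
cpus_read_lock() API, so the conversion in callers is mechanical. A
hedged sketch of a hypothetical caller (count_online_cpus_stable() is an
invented name; the lock functions and num_online_cpus() are the real
interfaces):

	#include <linux/cpu.h>
	#include <linux/cpumask.h>

	static unsigned int count_online_cpus_stable(void)
	{
		unsigned int cpus;

		cpus_read_lock();		/* was: get_online_cpus() */
		cpus = num_online_cpus();	/* CPUs cannot come or go here */
		cpus_read_unlock();		/* was: put_online_cpus() */

		return cpus;
	}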
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210803141621.780504-39-bigeasy@linutronix.de
---
 include/linux/cpu.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 94a578a96202..9cf51e41e697 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -143,12 +143,6 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif /* !CONFIG_HOTPLUG_CPU */
 
-/* Wrappers which go away once all code is converted */
-static inline void cpu_hotplug_begin(void) { cpus_write_lock(); }
-static inline void cpu_hotplug_done(void) { cpus_write_unlock(); }
-static inline void get_online_cpus(void) { cpus_read_lock(); }
-static inline void put_online_cpus(void) { cpus_read_unlock(); }
-
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
--
cgit v1.2.3

From c9871c800f65fffed40f3df3e1eb38984f95cfcf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 9 Sep 2021 14:34:59 +0200
Subject: Documentation: core-api/cpuhotplug: Rewrite the API section

Dave stumbled over the incomplete and confusing documentation of the
CPU hotplug API. Rewrite it, add the missing function documentation and
correct the existing entries.

Reported-by: Dave Chinner
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210909123212.489059409@linutronix.de
---
 include/linux/cpuhotplug.h | 132 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 110 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 39cf84a30b9f..832d8a74fa59 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -22,8 +22,42 @@
  * AP_ACTIVE			AP_ACTIVE
  */
 
+/*
+ * CPU hotplug states. The state machine invokes the installed state
+ * startup callbacks sequentially from CPUHP_OFFLINE + 1 to CPUHP_ONLINE
+ * during a CPU online operation. During a CPU offline operation the
+ * installed teardown callbacks are invoked in the reverse order from
+ * CPU_ONLINE - 1 down to CPUHP_OFFLINE.
+ *
+ * The state space has three sections: PREPARE, STARTING and ONLINE.
+ *
+ * PREPARE: The callbacks are invoked on a control CPU before the
+ * hotplugged CPU is started up or after the hotplugged CPU has died.
+ *
+ * STARTING: The callbacks are invoked on the hotplugged CPU from the low level
+ * hotplug startup/teardown code with interrupts disabled.
+ *
+ * ONLINE: The callbacks are invoked on the hotplugged CPU from the per CPU
+ * hotplug thread with interrupts and preemption enabled.
+ *
+ * Adding explicit states to this enum is only necessary when:
+ *
+ * 1) The state is within the STARTING section
+ *
+ * 2) The state has ordering constraints vs. other states in the
+ *    same section.
+ *
+ * If neither #1 nor #2 apply, please use the dynamic state space when
+ * setting up a state by using CPUHP_PREPARE_DYN or CPUHP_PREPARE_ONLINE
+ * for the @state argument of the setup function.
+ *
+ * See Documentation/core-api/cpu_hotplug.rst for further information and
+ * examples.
+ */
 enum cpuhp_state {
 	CPUHP_INVALID = -1,
+
+	/* PREPARE section invoked on a control CPU */
 	CPUHP_OFFLINE = 0,
 	CPUHP_CREATE_THREADS,
 	CPUHP_PERF_PREPARE,
@@ -95,6 +129,11 @@ enum cpuhp_state {
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END	= CPUHP_BP_PREPARE_DYN + 20,
 	CPUHP_BRINGUP_CPU,
+
+	/*
+	 * STARTING section invoked on the hotplugged CPU in low level
+	 * bringup and teardown code.
+	 */
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
 	CPUHP_AP_SCHED_STARTING,
@@ -155,6 +194,8 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
+
+	/* Online section invoked on the hotplugged CPU from the hotplug thread */
 	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
@@ -216,14 +257,15 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
 				   int (*teardown)(unsigned int cpu),
 				   bool multi_instance);
 /**
- * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
+ * cpuhp_setup_state - Setup hotplug state callbacks with calling the @startup
+ *		       callback
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback (will be used in debug output)
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
- * Installs the callback functions and invokes the startup callback on
- * the present cpus which have already reached the @state.
+ * Installs the callback functions and invokes the @startup callback on
+ * the online cpus which have already reached the @state.
 */
 static inline int cpuhp_setup_state(enum cpuhp_state state,
 				    const char *name,
@@ -233,6 +275,18 @@ static inline int cpuhp_setup_state(enum cpuhp_state state,
 	return __cpuhp_setup_state(state, name, true, startup, teardown, false);
 }
 
+/**
+ * cpuhp_setup_state_cpuslocked - Setup hotplug state callbacks with calling
+ *				  @startup callback from a cpus_read_lock()
+ *				  held region
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback (will be used in debug output)
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
+ *
+ * Same as cpuhp_setup_state() except that it must be invoked from within a
+ * cpus_read_lock() held region.
+ */
 static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
 					       const char *name,
 					       int (*startup)(unsigned int cpu),
@@ -244,14 +298,14 @@ static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
 
 /**
 * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
- *			       callbacks
+ *			       @startup callback
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback.
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
- * Same as @cpuhp_setup_state except that no calls are executed are invoked
- * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
+ * Same as cpuhp_setup_state() except that the @startup callback is not
+ * invoked during installation. NOP if SMP=n or HOTPLUG_CPU=n.
 */
 static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
 					    const char *name,
@@ -262,6 +316,19 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
 					    false);
 }
 
+/**
+ * cpuhp_setup_state_nocalls_cpuslocked - Setup hotplug state callbacks without
+ *					  invoking the @startup callback from
+ *					  a cpus_read_lock() held region
+ *			     callbacks
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback.
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
+ *
+ * Same as cpuhp_setup_state_nocalls() except that it must be invoked from
+ * within a cpus_read_lock() held region.
+ */
 static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
 						       const char *name,
 						       int (*startup)(unsigned int cpu),
@@ -275,13 +342,13 @@ static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
 * cpuhp_setup_state_multi - Add callbacks for multi state
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback.
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
 * Sets the internal multi_instance flag and prepares a state to work as a multi
 * instance callback. No callbacks are invoked at this point. The callbacks are
 * invoked once an instance for this state are registered via
- * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls.
+ * cpuhp_state_add_instance() or cpuhp_state_add_instance_nocalls()
 */
 static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
 					  const char *name,
@@ -306,9 +373,10 @@ int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
 * @state:	The state for which the instance is installed
 * @node:	The node for this individual state.
 *
- * Installs the instance for the @state and invokes the startup callback on
- * the present cpus which have already reached the @state. The @state must have
- * been earlier marked as multi-instance by @cpuhp_setup_state_multi.
+ * Installs the instance for the @state and invokes the registered startup
+ * callback on the online cpus which have already reached the @state. The
+ * @state must have been earlier marked as multi-instance by
+ * cpuhp_setup_state_multi().
 */
 static inline int cpuhp_state_add_instance(enum cpuhp_state state,
 					   struct hlist_node *node)
@@ -322,8 +390,9 @@ static inline int cpuhp_state_add_instance(enum cpuhp_state state,
 * @state:	The state for which the instance is installed
 * @node:	The node for this individual state.
 *
- * Installs the instance for the @state The @state must have been earlier
- * marked as multi-instance by @cpuhp_setup_state_multi.
+ * Installs the instance for the @state. The @state must have been earlier
+ * marked as multi-instance by cpuhp_setup_state_multi. NOP if SMP=n or
+ * HOTPLUG_CPU=n.
 */
 static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 						   struct hlist_node *node)
@@ -331,6 +400,17 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 	return __cpuhp_state_add_instance(state, node, false);
 }
 
+/**
+ * cpuhp_state_add_instance_nocalls_cpuslocked - Add an instance for a state
+ *						 without invoking the startup
+ *						 callback from a cpus_read_lock()
+ *						 held region.
+ * @state:	The state for which the instance is installed
+ * @node:	The node for this individual state.
+ *
+ * Same as cpuhp_state_add_instance_nocalls() except that it must be
+ * invoked from within a cpus_read_lock() held region.
+ */
 static inline int
 cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
 					    struct hlist_node *node)
@@ -346,7 +426,7 @@ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
 * @state:	The state for which the calls are removed
 *
 * Removes the callback functions and invokes the teardown callback on
- * the present cpus which have already reached the @state.
+ * the online cpus which have already reached the @state.
 */
 static inline void cpuhp_remove_state(enum cpuhp_state state)
 {
@@ -355,7 +435,7 @@ static inline void cpuhp_remove_state(enum cpuhp_state state)
 /**
 * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
- *				teardown
+ *				the teardown callback
 * @state:	The state for which the calls are removed
 */
 static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
@@ -363,6 +443,14 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
 	__cpuhp_remove_state(state, false);
 }
 
+/**
+ * cpuhp_remove_state_nocalls_cpuslocked - Remove hotplug state callbacks without invoking
+ *					   teardown from a cpus_read_lock() held region.
+ * @state:	The state for which the calls are removed
+ *
+ * Same as cpuhp_remove_state nocalls() except that it must be invoked
+ * from within a cpus_read_lock() held region.
+ */
 static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
 {
 	__cpuhp_remove_state_cpuslocked(state, false);
@@ -390,8 +478,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
 * @state:	The state from which the instance is removed
 * @node:	The node for this individual state.
 *
- * Removes the instance and invokes the teardown callback on the present cpus
- * which have already reached the @state.
+ * Removes the instance and invokes the teardown callback on the online cpus
+ * which have already reached @state.
 */
 static inline int cpuhp_state_remove_instance(enum cpuhp_state state,
 					      struct hlist_node *node)
--
cgit v1.2.3
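To make the rewritten kernel-doc concrete, here is a hedged sketch of a
client of this API: a hypothetical module registering a dynamic
ONLINE-section state. The demo_* names are invented for this example;
cpuhp_setup_state(), CPUHP_AP_ONLINE_DYN and cpuhp_remove_state() are the
real interfaces documented above:

	#include <linux/cpu.h>
	#include <linux/cpuhotplug.h>
	#include <linux/module.h>
	#include <linux/printk.h>

	static enum cpuhp_state demo_state;

	/* ONLINE-section callbacks: run on the hotplugged CPU, preemptible. */
	static int demo_cpu_online(unsigned int cpu)
	{
		pr_info("demo: CPU%u is coming online\n", cpu);
		return 0;	/* a negative return aborts the online operation */
	}

	static int demo_cpu_offline(unsigned int cpu)
	{
		pr_info("demo: CPU%u is going down\n", cpu);
		return 0;
	}

	static int __init demo_init(void)
	{
		int ret;

		/*
		 * CPUHP_AP_ONLINE_DYN reserves a dynamic state; per the
		 * cpuhp_setup_state() kernel-doc the @startup callback is
		 * also invoked on all CPUs that are already online.
		 */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
					demo_cpu_online, demo_cpu_offline);
		if (ret < 0)
			return ret;

		demo_state = ret;	/* remember the allocated state */
		return 0;
	}

	static void __exit demo_exit(void)
	{
		/* Runs the teardown callback on all online CPUs, then unregisters. */
		cpuhp_remove_state(demo_state);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");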