From a1867f85e06edacd82956d3422caa2b9074f4321 Mon Sep 17 00:00:00 2001 From: Min Li Date: Fri, 18 Jun 2021 12:37:12 -0400 Subject: mfd: Add Renesas Synchronization Management Unit (SMU) support Add support for ClockMatrix(TM) and 82P33xxx families of timing and synchronization devices. The access interface can be either SPI or I2C. Currently, it will create 2 types of MFD devices, which are to be used by the corresponding rsmu character device driver and the PTP hardware clock driver, respectively. Signed-off-by: Min Li Signed-off-by: Lee Jones --- include/linux/mfd/idt82p33_reg.h | 112 ++++++ include/linux/mfd/idt8a340_reg.h | 729 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/rsmu.h | 36 ++ 3 files changed, 877 insertions(+) create mode 100644 include/linux/mfd/idt82p33_reg.h create mode 100644 include/linux/mfd/idt8a340_reg.h create mode 100644 include/linux/mfd/rsmu.h (limited to 'include/linux') diff --git a/include/linux/mfd/idt82p33_reg.h b/include/linux/mfd/idt82p33_reg.h new file mode 100644 index 000000000000..129a6c078221 --- /dev/null +++ b/include/linux/mfd/idt82p33_reg.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Register Map - Based on AN888_SMUforIEEE_SynchEther_82P33xxx_RevH.pdf + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ +#ifndef HAVE_IDT82P33_REG +#define HAVE_IDT82P33_REG + +/* Register address */ +#define DPLL1_TOD_CNFG 0x134 +#define DPLL2_TOD_CNFG 0x1B4 + +#define DPLL1_TOD_STS 0x10B +#define DPLL2_TOD_STS 0x18B + +#define DPLL1_TOD_TRIGGER 0x115 +#define DPLL2_TOD_TRIGGER 0x195 + +#define DPLL1_OPERATING_MODE_CNFG 0x120 +#define DPLL2_OPERATING_MODE_CNFG 0x1A0 + +#define DPLL1_HOLDOVER_FREQ_CNFG 0x12C +#define DPLL2_HOLDOVER_FREQ_CNFG 0x1AC + +#define DPLL1_PHASE_OFFSET_CNFG 0x143 +#define DPLL2_PHASE_OFFSET_CNFG 0x1C3 + +#define DPLL1_SYNC_EDGE_CNFG 0x140 +#define DPLL2_SYNC_EDGE_CNFG 0x1C0 + +#define DPLL1_INPUT_MODE_CNFG 0x116 +#define DPLL2_INPUT_MODE_CNFG 0x196 + +#define DPLL1_OPERATING_STS 0x102 +#define DPLL2_OPERATING_STS 0x182 + +#define DPLL1_CURRENT_FREQ_STS 0x103 +#define DPLL2_CURRENT_FREQ_STS 0x183 + +#define REG_SOFT_RESET 0X381 + +#define OUT_MUX_CNFG(outn) REG_ADDR(0x6, (0xC * (outn))) + +/* Register bit definitions */ +#define SYNC_TOD BIT(1) +#define PH_OFFSET_EN BIT(7) +#define SQUELCH_ENABLE BIT(5) + +/* Bit definitions for the DPLL_MODE register */ +#define PLL_MODE_SHIFT (0) +#define PLL_MODE_MASK (0x1F) +#define COMBO_MODE_EN BIT(5) +#define COMBO_MODE_SHIFT (6) +#define COMBO_MODE_MASK (0x3) + +/* Bit definitions for DPLL_OPERATING_STS register */ +#define OPERATING_STS_MASK (0x7) +#define OPERATING_STS_SHIFT (0x0) + +/* Bit definitions for DPLL_TOD_TRIGGER register */ +#define READ_TRIGGER_MASK (0xF) +#define READ_TRIGGER_SHIFT (0x0) +#define WRITE_TRIGGER_MASK (0xF0) +#define WRITE_TRIGGER_SHIFT (0x4) + +/* Bit definitions for REG_SOFT_RESET register */ +#define SOFT_RESET_EN BIT(7) + +enum pll_mode { + PLL_MODE_MIN = 0, + PLL_MODE_AUTOMATIC = PLL_MODE_MIN, + PLL_MODE_FORCE_FREERUN = 1, + PLL_MODE_FORCE_HOLDOVER = 2, + PLL_MODE_FORCE_LOCKED = 4, + PLL_MODE_FORCE_PRE_LOCKED2 = 5, + PLL_MODE_FORCE_PRE_LOCKED = 6, + PLL_MODE_FORCE_LOST_PHASE = 7, + PLL_MODE_DCO = 10, + PLL_MODE_WPH = 18, + PLL_MODE_MAX = PLL_MODE_WPH, +}; + +enum hw_tod_trig_sel { + HW_TOD_TRIG_SEL_MIN = 0, + HW_TOD_TRIG_SEL_NO_WRITE = HW_TOD_TRIG_SEL_MIN, + HW_TOD_TRIG_SEL_NO_READ = HW_TOD_TRIG_SEL_MIN, + HW_TOD_TRIG_SEL_SYNC_SEL = 1, + HW_TOD_TRIG_SEL_IN12 = 2, + HW_TOD_TRIG_SEL_IN13 = 3, + 
HW_TOD_TRIG_SEL_IN14 = 4, + HW_TOD_TRIG_SEL_TOD_PPS = 5, + HW_TOD_TRIG_SEL_TIMER_INTERVAL = 6, + HW_TOD_TRIG_SEL_MSB_PHASE_OFFSET_CNFG = 7, + HW_TOD_TRIG_SEL_MSB_HOLDOVER_FREQ_CNFG = 8, + HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG = 9, + HW_TOD_RD_TRIG_SEL_LSB_TOD_STS = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, + WR_TRIG_SEL_MAX = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, +}; + +/** @brief Enumerated type listing DPLL operational modes */ +enum dpll_state { + DPLL_STATE_FREERUN = 1, + DPLL_STATE_HOLDOVER = 2, + DPLL_STATE_LOCKED = 4, + DPLL_STATE_PRELOCKED2 = 5, + DPLL_STATE_PRELOCKED = 6, + DPLL_STATE_LOSTPHASE = 7, + DPLL_STATE_MAX +}; + +#endif diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h new file mode 100644 index 000000000000..92d763230bdf --- /dev/null +++ b/include/linux/mfd/idt8a340_reg.h @@ -0,0 +1,729 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Based on 5.2.0, Family Programming Guide (Sept 30, 2020) + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ +#ifndef HAVE_IDT8A340_REG +#define HAVE_IDT8A340_REG + +#define PAGE_ADDR_BASE 0x0000 +#define PAGE_ADDR 0x00fc + +#define HW_REVISION 0x8180 +#define REV_ID 0x007a + +#define HW_DPLL_0 (0x8a00) +#define HW_DPLL_1 (0x8b00) +#define HW_DPLL_2 (0x8c00) +#define HW_DPLL_3 (0x8d00) +#define HW_DPLL_4 (0x8e00) +#define HW_DPLL_5 (0x8f00) +#define HW_DPLL_6 (0x9000) +#define HW_DPLL_7 (0x9100) + +#define HW_DPLL_TOD_SW_TRIG_ADDR__0 (0x080) +#define HW_DPLL_TOD_CTRL_1 (0x089) +#define HW_DPLL_TOD_CTRL_2 (0x08A) +#define HW_DPLL_TOD_OVR__0 (0x098) +#define HW_DPLL_TOD_OUT_0__0 (0x0B0) + +#define HW_Q0_Q1_CH_SYNC_CTRL_0 (0xa740) +#define HW_Q0_Q1_CH_SYNC_CTRL_1 (0xa741) +#define HW_Q2_Q3_CH_SYNC_CTRL_0 (0xa742) +#define HW_Q2_Q3_CH_SYNC_CTRL_1 (0xa743) +#define HW_Q4_Q5_CH_SYNC_CTRL_0 (0xa744) +#define HW_Q4_Q5_CH_SYNC_CTRL_1 (0xa745) +#define HW_Q6_Q7_CH_SYNC_CTRL_0 (0xa746) +#define HW_Q6_Q7_CH_SYNC_CTRL_1 (0xa747) +#define HW_Q8_CH_SYNC_CTRL_0 (0xa748) +#define HW_Q8_CH_SYNC_CTRL_1 (0xa749) +#define HW_Q9_CH_SYNC_CTRL_0 (0xa74a) +#define HW_Q9_CH_SYNC_CTRL_1 (0xa74b) +#define HW_Q10_CH_SYNC_CTRL_0 (0xa74c) +#define HW_Q10_CH_SYNC_CTRL_1 (0xa74d) +#define HW_Q11_CH_SYNC_CTRL_0 (0xa74e) +#define HW_Q11_CH_SYNC_CTRL_1 (0xa74f) + +#define SYNC_SOURCE_DPLL0_TOD_PPS 0x14 +#define SYNC_SOURCE_DPLL1_TOD_PPS 0x15 +#define SYNC_SOURCE_DPLL2_TOD_PPS 0x16 +#define SYNC_SOURCE_DPLL3_TOD_PPS 0x17 + +#define SYNCTRL1_MASTER_SYNC_RST BIT(7) +#define SYNCTRL1_MASTER_SYNC_TRIG BIT(5) +#define SYNCTRL1_TOD_SYNC_TRIG BIT(4) +#define SYNCTRL1_FBDIV_FRAME_SYNC_TRIG BIT(3) +#define SYNCTRL1_FBDIV_SYNC_TRIG BIT(2) +#define SYNCTRL1_Q1_DIV_SYNC_TRIG BIT(1) +#define SYNCTRL1_Q0_DIV_SYNC_TRIG BIT(0) + +#define HW_Q8_CTRL_SPARE (0xa7d4) +#define HW_Q11_CTRL_SPARE (0xa7ec) + +/** + * Select FOD5 as sync_trigger for Q8 divider. + * Transition from logic zero to one + * sets trigger to sync Q8 divider. + * + * Unused when FOD4 is driving Q8 divider (normal operation). + */ +#define Q9_TO_Q8_SYNC_TRIG BIT(1) + +/** + * Enable FOD5 as driver for clock and sync for Q8 divider. + * Enable fanout buffer for FOD5. + * + * Unused when FOD4 is driving Q8 divider (normal operation). + */ +#define Q9_TO_Q8_FANOUT_AND_CLOCK_SYNC_ENABLE_MASK (BIT(0) | BIT(2)) + +/** + * Select FOD6 as sync_trigger for Q11 divider. + * Transition from logic zero to one + * sets trigger to sync Q11 divider. + * + * Unused when FOD7 is driving Q11 divider (normal operation). 
+ */ +#define Q10_TO_Q11_SYNC_TRIG BIT(1) + +/** + * Enable FOD6 as driver for clock and sync for Q11 divider. + * Enable fanout buffer for FOD6. + * + * Unused when FOD7 is driving Q11 divider (normal operation). + */ +#define Q10_TO_Q11_FANOUT_AND_CLOCK_SYNC_ENABLE_MASK (BIT(0) | BIT(2)) + +#define RESET_CTRL 0xc000 +#define SM_RESET 0x0012 +#define SM_RESET_V520 0x0013 +#define SM_RESET_CMD 0x5A + +#define GENERAL_STATUS 0xc014 +#define BOOT_STATUS 0x0000 +#define HW_REV_ID 0x000A +#define BOND_ID 0x000B +#define HW_CSR_ID 0x000C +#define HW_IRQ_ID 0x000E +#define MAJ_REL 0x0010 +#define MIN_REL 0x0011 +#define HOTFIX_REL 0x0012 +#define PIPELINE_ID 0x0014 +#define BUILD_ID 0x0018 +#define JTAG_DEVICE_ID 0x001c +#define PRODUCT_ID 0x001e +#define OTP_SCSR_CONFIG_SELECT 0x0022 + +#define STATUS 0xc03c +#define DPLL0_STATUS 0x0018 +#define DPLL1_STATUS 0x0019 +#define DPLL2_STATUS 0x001a +#define DPLL3_STATUS 0x001b +#define DPLL4_STATUS 0x001c +#define DPLL5_STATUS 0x001d +#define DPLL6_STATUS 0x001e +#define DPLL7_STATUS 0x001f +#define DPLL_SYS_STATUS 0x0020 +#define DPLL_SYS_APLL_STATUS 0x0021 +#define DPLL0_FILTER_STATUS 0x0044 +#define DPLL1_FILTER_STATUS 0x004c +#define DPLL2_FILTER_STATUS 0x0054 +#define DPLL3_FILTER_STATUS 0x005c +#define DPLL4_FILTER_STATUS 0x0064 +#define DPLL5_FILTER_STATUS 0x006c +#define DPLL6_FILTER_STATUS 0x0074 +#define DPLL7_FILTER_STATUS 0x007c +#define DPLLSYS_FILTER_STATUS 0x0084 +#define USER_GPIO0_TO_7_STATUS 0x008a +#define USER_GPIO8_TO_15_STATUS 0x008b + +#define GPIO_USER_CONTROL 0xc160 +#define GPIO0_TO_7_OUT 0x0000 +#define GPIO8_TO_15_OUT 0x0001 +#define GPIO0_TO_7_OUT_V520 0x0002 +#define GPIO8_TO_15_OUT_V520 0x0003 + +#define STICKY_STATUS_CLEAR 0xc164 + +#define GPIO_TOD_NOTIFICATION_CLEAR 0xc16c + +#define ALERT_CFG 0xc188 + +#define SYS_DPLL_XO 0xc194 + +#define SYS_APLL 0xc19c + +#define INPUT_0 0xc1b0 +#define INPUT_1 0xc1c0 +#define INPUT_2 0xc1d0 +#define INPUT_3 0xc200 +#define INPUT_4 0xc210 +#define INPUT_5 0xc220 +#define INPUT_6 0xc230 +#define INPUT_7 0xc240 +#define INPUT_8 0xc250 +#define INPUT_9 0xc260 +#define INPUT_10 0xc280 +#define INPUT_11 0xc290 +#define INPUT_12 0xc2a0 +#define INPUT_13 0xc2b0 +#define INPUT_14 0xc2c0 +#define INPUT_15 0xc2d0 + +#define REF_MON_0 0xc2e0 +#define REF_MON_1 0xc2ec +#define REF_MON_2 0xc300 +#define REF_MON_3 0xc30c +#define REF_MON_4 0xc318 +#define REF_MON_5 0xc324 +#define REF_MON_6 0xc330 +#define REF_MON_7 0xc33c +#define REF_MON_8 0xc348 +#define REF_MON_9 0xc354 +#define REF_MON_10 0xc360 +#define REF_MON_11 0xc36c +#define REF_MON_12 0xc380 +#define REF_MON_13 0xc38c +#define REF_MON_14 0xc398 +#define REF_MON_15 0xc3a4 + +#define DPLL_0 0xc3b0 +#define DPLL_CTRL_REG_0 0x0002 +#define DPLL_CTRL_REG_1 0x0003 +#define DPLL_CTRL_REG_2 0x0004 +#define DPLL_TOD_SYNC_CFG 0x0031 +#define DPLL_COMBO_SLAVE_CFG_0 0x0032 +#define DPLL_COMBO_SLAVE_CFG_1 0x0033 +#define DPLL_SLAVE_REF_CFG 0x0034 +#define DPLL_REF_MODE 0x0035 +#define DPLL_PHASE_MEASUREMENT_CFG 0x0036 +#define DPLL_MODE 0x0037 +#define DPLL_MODE_V520 0x003B +#define DPLL_1 0xc400 +#define DPLL_2 0xc438 +#define DPLL_2_V520 0xc43c +#define DPLL_3 0xc480 +#define DPLL_4 0xc4b8 +#define DPLL_4_V520 0xc4bc +#define DPLL_5 0xc500 +#define DPLL_6 0xc538 +#define DPLL_6_V520 0xc53c +#define DPLL_7 0xc580 +#define SYS_DPLL 0xc5b8 +#define SYS_DPLL_V520 0xc5bc + +#define DPLL_CTRL_0 0xc600 +#define DPLL_CTRL_DPLL_MANU_REF_CFG 0x0001 +#define DPLL_CTRL_DPLL_FOD_FREQ 0x001c +#define DPLL_CTRL_COMBO_MASTER_CFG 0x003a +#define DPLL_CTRL_1 
0xc63c +#define DPLL_CTRL_2 0xc680 +#define DPLL_CTRL_3 0xc6bc +#define DPLL_CTRL_4 0xc700 +#define DPLL_CTRL_5 0xc73c +#define DPLL_CTRL_6 0xc780 +#define DPLL_CTRL_7 0xc7bc +#define SYS_DPLL_CTRL 0xc800 + +#define DPLL_PHASE_0 0xc818 +/* Signed 42-bit FFO in units of 2^(-53) */ +#define DPLL_WR_PHASE 0x0000 +#define DPLL_PHASE_1 0xc81c +#define DPLL_PHASE_2 0xc820 +#define DPLL_PHASE_3 0xc824 +#define DPLL_PHASE_4 0xc828 +#define DPLL_PHASE_5 0xc82c +#define DPLL_PHASE_6 0xc830 +#define DPLL_PHASE_7 0xc834 + +#define DPLL_FREQ_0 0xc838 +/* Signed 42-bit FFO in units of 2^(-53) */ +#define DPLL_WR_FREQ 0x0000 +#define DPLL_FREQ_1 0xc840 +#define DPLL_FREQ_2 0xc848 +#define DPLL_FREQ_3 0xc850 +#define DPLL_FREQ_4 0xc858 +#define DPLL_FREQ_5 0xc860 +#define DPLL_FREQ_6 0xc868 +#define DPLL_FREQ_7 0xc870 + +#define DPLL_PHASE_PULL_IN_0 0xc880 +#define PULL_IN_OFFSET 0x0000 /* Signed 32 bit */ +#define PULL_IN_SLOPE_LIMIT 0x0004 /* Unsigned 24 bit */ +#define PULL_IN_CTRL 0x0007 +#define DPLL_PHASE_PULL_IN_1 0xc888 +#define DPLL_PHASE_PULL_IN_2 0xc890 +#define DPLL_PHASE_PULL_IN_3 0xc898 +#define DPLL_PHASE_PULL_IN_4 0xc8a0 +#define DPLL_PHASE_PULL_IN_5 0xc8a8 +#define DPLL_PHASE_PULL_IN_6 0xc8b0 +#define DPLL_PHASE_PULL_IN_7 0xc8b8 + +#define GPIO_CFG 0xc8c0 +#define GPIO_CFG_GBL 0x0000 +#define GPIO_0 0xc8c2 +#define GPIO_DCO_INC_DEC 0x0000 +#define GPIO_OUT_CTRL_0 0x0001 +#define GPIO_OUT_CTRL_1 0x0002 +#define GPIO_TOD_TRIG 0x0003 +#define GPIO_DPLL_INDICATOR 0x0004 +#define GPIO_LOS_INDICATOR 0x0005 +#define GPIO_REF_INPUT_DSQ_0 0x0006 +#define GPIO_REF_INPUT_DSQ_1 0x0007 +#define GPIO_REF_INPUT_DSQ_2 0x0008 +#define GPIO_REF_INPUT_DSQ_3 0x0009 +#define GPIO_MAN_CLK_SEL_0 0x000a +#define GPIO_MAN_CLK_SEL_1 0x000b +#define GPIO_MAN_CLK_SEL_2 0x000c +#define GPIO_SLAVE 0x000d +#define GPIO_ALERT_OUT_CFG 0x000e +#define GPIO_TOD_NOTIFICATION_CFG 0x000f +#define GPIO_CTRL 0x0010 +#define GPIO_CTRL_V520 0x0011 +#define GPIO_1 0xc8d4 +#define GPIO_2 0xc8e6 +#define GPIO_3 0xc900 +#define GPIO_4 0xc912 +#define GPIO_5 0xc924 +#define GPIO_6 0xc936 +#define GPIO_7 0xc948 +#define GPIO_8 0xc95a +#define GPIO_9 0xc980 +#define GPIO_10 0xc992 +#define GPIO_11 0xc9a4 +#define GPIO_12 0xc9b6 +#define GPIO_13 0xc9c8 +#define GPIO_14 0xc9da +#define GPIO_15 0xca00 + +#define OUT_DIV_MUX 0xca12 +#define OUTPUT_0 0xca14 +#define OUTPUT_0_V520 0xca20 +/* FOD frequency output divider value */ +#define OUT_DIV 0x0000 +#define OUT_DUTY_CYCLE_HIGH 0x0004 +#define OUT_CTRL_0 0x0008 +#define OUT_CTRL_1 0x0009 +/* Phase adjustment in FOD cycles */ +#define OUT_PHASE_ADJ 0x000c +#define OUTPUT_1 0xca24 +#define OUTPUT_1_V520 0xca30 +#define OUTPUT_2 0xca34 +#define OUTPUT_2_V520 0xca40 +#define OUTPUT_3 0xca44 +#define OUTPUT_3_V520 0xca50 +#define OUTPUT_4 0xca54 +#define OUTPUT_4_V520 0xca60 +#define OUTPUT_5 0xca64 +#define OUTPUT_5_V520 0xca80 +#define OUTPUT_6 0xca80 +#define OUTPUT_6_V520 0xca90 +#define OUTPUT_7 0xca90 +#define OUTPUT_7_V520 0xcaa0 +#define OUTPUT_8 0xcaa0 +#define OUTPUT_8_V520 0xcab0 +#define OUTPUT_9 0xcab0 +#define OUTPUT_9_V520 0xcac0 +#define OUTPUT_10 0xcac0 +#define OUTPUT_10_V520 0xcad0 +#define OUTPUT_11 0xcad0 +#define OUTPUT_11_V520 0xcae0 + +#define SERIAL 0xcae0 +#define SERIAL_V520 0xcaf0 + +#define PWM_ENCODER_0 0xcb00 +#define PWM_ENCODER_1 0xcb08 +#define PWM_ENCODER_2 0xcb10 +#define PWM_ENCODER_3 0xcb18 +#define PWM_ENCODER_4 0xcb20 +#define PWM_ENCODER_5 0xcb28 +#define PWM_ENCODER_6 0xcb30 +#define PWM_ENCODER_7 0xcb38 +#define PWM_DECODER_0 0xcb40 +#define 
PWM_DECODER_1 0xcb48 +#define PWM_DECODER_1_V520 0xcb4a +#define PWM_DECODER_2 0xcb50 +#define PWM_DECODER_2_V520 0xcb54 +#define PWM_DECODER_3 0xcb58 +#define PWM_DECODER_3_V520 0xcb5e +#define PWM_DECODER_4 0xcb60 +#define PWM_DECODER_4_V520 0xcb68 +#define PWM_DECODER_5 0xcb68 +#define PWM_DECODER_5_V520 0xcb80 +#define PWM_DECODER_6 0xcb70 +#define PWM_DECODER_6_V520 0xcb8a +#define PWM_DECODER_7 0xcb80 +#define PWM_DECODER_7_V520 0xcb94 +#define PWM_DECODER_8 0xcb88 +#define PWM_DECODER_8_V520 0xcb9e +#define PWM_DECODER_9 0xcb90 +#define PWM_DECODER_9_V520 0xcba8 +#define PWM_DECODER_10 0xcb98 +#define PWM_DECODER_10_V520 0xcbb2 +#define PWM_DECODER_11 0xcba0 +#define PWM_DECODER_11_V520 0xcbbc +#define PWM_DECODER_12 0xcba8 +#define PWM_DECODER_12_V520 0xcbc6 +#define PWM_DECODER_13 0xcbb0 +#define PWM_DECODER_13_V520 0xcbd0 +#define PWM_DECODER_14 0xcbb8 +#define PWM_DECODER_14_V520 0xcbda +#define PWM_DECODER_15 0xcbc0 +#define PWM_DECODER_15_V520 0xcbe4 +#define PWM_USER_DATA 0xcbc8 +#define PWM_USER_DATA_V520 0xcbf0 + +#define TOD_0 0xcbcc +#define TOD_0_V520 0xcc00 +/* Enable TOD counter, output channel sync and even-PPS mode */ +#define TOD_CFG 0x0000 +#define TOD_CFG_V520 0x0001 +#define TOD_1 0xcbce +#define TOD_1_V520 0xcc02 +#define TOD_2 0xcbd0 +#define TOD_2_V520 0xcc04 +#define TOD_3 0xcbd2 +#define TOD_3_V520 0xcc06 + +#define TOD_WRITE_0 0xcc00 +#define TOD_WRITE_0_V520 0xcc10 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_WRITE 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_WRITE_COUNTER 0x000c +/* TOD write trigger configuration */ +#define TOD_WRITE_SELECT_CFG_0 0x000d +/* TOD write trigger selection */ +#define TOD_WRITE_CMD 0x000f +#define TOD_WRITE_1 0xcc10 +#define TOD_WRITE_1_V520 0xcc20 +#define TOD_WRITE_2 0xcc20 +#define TOD_WRITE_2_V520 0xcc30 +#define TOD_WRITE_3 0xcc30 +#define TOD_WRITE_3_V520 0xcc40 + +#define TOD_READ_PRIMARY_0 0xcc40 +#define TOD_READ_PRIMARY_0_V520 0xcc50 +/* 8-bit subns, 32-bit ns, 48-bit seconds */ +#define TOD_READ_PRIMARY 0x0000 +/* Counter increments after TOD write is completed */ +#define TOD_READ_PRIMARY_COUNTER 0x000b +/* Read trigger configuration */ +#define TOD_READ_PRIMARY_SEL_CFG_0 0x000c +/* Read trigger selection */ +#define TOD_READ_PRIMARY_CMD 0x000e +#define TOD_READ_PRIMARY_CMD_V520 0x000f +#define TOD_READ_PRIMARY_1 0xcc50 +#define TOD_READ_PRIMARY_1_V520 0xcc60 +#define TOD_READ_PRIMARY_2 0xcc60 +#define TOD_READ_PRIMARY_2_V520 0xcc80 +#define TOD_READ_PRIMARY_3 0xcc80 +#define TOD_READ_PRIMARY_3_V520 0xcc90 + +#define TOD_READ_SECONDARY_0 0xcc90 +#define TOD_READ_SECONDARY_0_V520 0xcca0 +#define TOD_READ_SECONDARY_1 0xcca0 +#define TOD_READ_SECONDARY_1_V520 0xccb0 +#define TOD_READ_SECONDARY_2 0xccb0 +#define TOD_READ_SECONDARY_2_V520 0xccc0 +#define TOD_READ_SECONDARY_3 0xccc0 +#define TOD_READ_SECONDARY_3_V520 0xccd0 + +#define OUTPUT_TDC_CFG 0xccd0 +#define OUTPUT_TDC_CFG_V520 0xcce0 +#define OUTPUT_TDC_0 0xcd00 +#define OUTPUT_TDC_1 0xcd08 +#define OUTPUT_TDC_2 0xcd10 +#define OUTPUT_TDC_3 0xcd18 +#define INPUT_TDC 0xcd20 + +#define SCRATCH 0xcf50 +#define SCRATCH_V520 0xcf4c + +#define EEPROM 0xcf68 +#define EEPROM_V520 0xcf64 + +#define OTP 0xcf70 + +#define BYTE 0xcf80 + +/* Bit definitions for the MAJ_REL register */ +#define MAJOR_SHIFT (1) +#define MAJOR_MASK (0x7f) +#define PR_BUILD BIT(0) + +/* Bit definitions for the USER_GPIO0_TO_7_STATUS register */ +#define GPIO0_LEVEL BIT(0) +#define GPIO1_LEVEL BIT(1) +#define GPIO2_LEVEL BIT(2) +#define GPIO3_LEVEL 
BIT(3) +#define GPIO4_LEVEL BIT(4) +#define GPIO5_LEVEL BIT(5) +#define GPIO6_LEVEL BIT(6) +#define GPIO7_LEVEL BIT(7) + +/* Bit definitions for the USER_GPIO8_TO_15_STATUS register */ +#define GPIO8_LEVEL BIT(0) +#define GPIO9_LEVEL BIT(1) +#define GPIO10_LEVEL BIT(2) +#define GPIO11_LEVEL BIT(3) +#define GPIO12_LEVEL BIT(4) +#define GPIO13_LEVEL BIT(5) +#define GPIO14_LEVEL BIT(6) +#define GPIO15_LEVEL BIT(7) + +/* Bit definitions for the GPIO0_TO_7_OUT register */ +#define GPIO0_DRIVE_LEVEL BIT(0) +#define GPIO1_DRIVE_LEVEL BIT(1) +#define GPIO2_DRIVE_LEVEL BIT(2) +#define GPIO3_DRIVE_LEVEL BIT(3) +#define GPIO4_DRIVE_LEVEL BIT(4) +#define GPIO5_DRIVE_LEVEL BIT(5) +#define GPIO6_DRIVE_LEVEL BIT(6) +#define GPIO7_DRIVE_LEVEL BIT(7) + +/* Bit definitions for the GPIO8_TO_15_OUT register */ +#define GPIO8_DRIVE_LEVEL BIT(0) +#define GPIO9_DRIVE_LEVEL BIT(1) +#define GPIO10_DRIVE_LEVEL BIT(2) +#define GPIO11_DRIVE_LEVEL BIT(3) +#define GPIO12_DRIVE_LEVEL BIT(4) +#define GPIO13_DRIVE_LEVEL BIT(5) +#define GPIO14_DRIVE_LEVEL BIT(6) +#define GPIO15_DRIVE_LEVEL BIT(7) + +/* Bit definitions for the DPLL_TOD_SYNC_CFG register */ +#define TOD_SYNC_SOURCE_SHIFT (1) +#define TOD_SYNC_SOURCE_MASK (0x3) +#define TOD_SYNC_EN BIT(0) + +/* Bit definitions for the DPLL_MODE register */ +#define WRITE_TIMER_MODE BIT(6) +#define PLL_MODE_SHIFT (3) +#define PLL_MODE_MASK (0x7) +#define STATE_MODE_SHIFT (0) +#define STATE_MODE_MASK (0x7) + +/* Bit definitions for the GPIO_CFG_GBL register */ +#define SUPPLY_MODE_SHIFT (0) +#define SUPPLY_MODE_MASK (0x3) + +/* Bit definitions for the GPIO_DCO_INC_DEC register */ +#define INCDEC_DPLL_INDEX_SHIFT (0) +#define INCDEC_DPLL_INDEX_MASK (0x7) + +/* Bit definitions for the GPIO_OUT_CTRL_0 register */ +#define CTRL_OUT_0 BIT(0) +#define CTRL_OUT_1 BIT(1) +#define CTRL_OUT_2 BIT(2) +#define CTRL_OUT_3 BIT(3) +#define CTRL_OUT_4 BIT(4) +#define CTRL_OUT_5 BIT(5) +#define CTRL_OUT_6 BIT(6) +#define CTRL_OUT_7 BIT(7) + +/* Bit definitions for the GPIO_OUT_CTRL_1 register */ +#define CTRL_OUT_8 BIT(0) +#define CTRL_OUT_9 BIT(1) +#define CTRL_OUT_10 BIT(2) +#define CTRL_OUT_11 BIT(3) +#define CTRL_OUT_12 BIT(4) +#define CTRL_OUT_13 BIT(5) +#define CTRL_OUT_14 BIT(6) +#define CTRL_OUT_15 BIT(7) + +/* Bit definitions for the GPIO_TOD_TRIG register */ +#define TOD_TRIG_0 BIT(0) +#define TOD_TRIG_1 BIT(1) +#define TOD_TRIG_2 BIT(2) +#define TOD_TRIG_3 BIT(3) + +/* Bit definitions for the GPIO_DPLL_INDICATOR register */ +#define IND_DPLL_INDEX_SHIFT (0) +#define IND_DPLL_INDEX_MASK (0x7) + +/* Bit definitions for the GPIO_LOS_INDICATOR register */ +#define REFMON_INDEX_SHIFT (0) +#define REFMON_INDEX_MASK (0xf) +/* Active level of LOS indicator, 0=low 1=high */ +#define ACTIVE_LEVEL BIT(4) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_0 register */ +#define DSQ_INP_0 BIT(0) +#define DSQ_INP_1 BIT(1) +#define DSQ_INP_2 BIT(2) +#define DSQ_INP_3 BIT(3) +#define DSQ_INP_4 BIT(4) +#define DSQ_INP_5 BIT(5) +#define DSQ_INP_6 BIT(6) +#define DSQ_INP_7 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_1 register */ +#define DSQ_INP_8 BIT(0) +#define DSQ_INP_9 BIT(1) +#define DSQ_INP_10 BIT(2) +#define DSQ_INP_11 BIT(3) +#define DSQ_INP_12 BIT(4) +#define DSQ_INP_13 BIT(5) +#define DSQ_INP_14 BIT(6) +#define DSQ_INP_15 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_2 register */ +#define DSQ_DPLL_0 BIT(0) +#define DSQ_DPLL_1 BIT(1) +#define DSQ_DPLL_2 BIT(2) +#define DSQ_DPLL_3 BIT(3) +#define DSQ_DPLL_4 BIT(4) +#define DSQ_DPLL_5 BIT(5) +#define DSQ_DPLL_6 BIT(6) 
+#define DSQ_DPLL_7 BIT(7) + +/* Bit definitions for the GPIO_REF_INPUT_DSQ_3 register */ +#define DSQ_DPLL_SYS BIT(0) +#define GPIO_DSQ_LEVEL BIT(1) + +/* Bit definitions for the GPIO_TOD_NOTIFICATION_CFG register */ +#define DPLL_TOD_SHIFT (0) +#define DPLL_TOD_MASK (0x3) +#define TOD_READ_SECONDARY BIT(2) +#define GPIO_ASSERT_LEVEL BIT(3) + +/* Bit definitions for the GPIO_CTRL register */ +#define GPIO_FUNCTION_EN BIT(0) +#define GPIO_CMOS_OD_MODE BIT(1) +#define GPIO_CONTROL_DIR BIT(2) +#define GPIO_PU_PD_MODE BIT(3) +#define GPIO_FUNCTION_SHIFT (4) +#define GPIO_FUNCTION_MASK (0xf) + +/* Bit definitions for the OUT_CTRL_1 register */ +#define OUT_SYNC_DISABLE BIT(7) +#define SQUELCH_VALUE BIT(6) +#define SQUELCH_DISABLE BIT(5) +#define PAD_VDDO_SHIFT (2) +#define PAD_VDDO_MASK (0x7) +#define PAD_CMOSDRV_SHIFT (0) +#define PAD_CMOSDRV_MASK (0x3) + +/* Bit definitions for the TOD_CFG register */ +#define TOD_EVEN_PPS_MODE BIT(2) +#define TOD_OUT_SYNC_ENABLE BIT(1) +#define TOD_ENABLE BIT(0) + +/* Bit definitions for the TOD_WRITE_SELECT_CFG_0 register */ +#define WR_PWM_DECODER_INDEX_SHIFT (4) +#define WR_PWM_DECODER_INDEX_MASK (0xf) +#define WR_REF_INDEX_SHIFT (0) +#define WR_REF_INDEX_MASK (0xf) + +/* Bit definitions for the TOD_WRITE_CMD register */ +#define TOD_WRITE_SELECTION_SHIFT (0) +#define TOD_WRITE_SELECTION_MASK (0xf) +/* 4.8.7 */ +#define TOD_WRITE_TYPE_SHIFT (4) +#define TOD_WRITE_TYPE_MASK (0x3) + +/* Bit definitions for the TOD_READ_PRIMARY_SEL_CFG_0 register */ +#define RD_PWM_DECODER_INDEX_SHIFT (4) +#define RD_PWM_DECODER_INDEX_MASK (0xf) +#define RD_REF_INDEX_SHIFT (0) +#define RD_REF_INDEX_MASK (0xf) + +/* Bit definitions for the TOD_READ_PRIMARY_CMD register */ +#define TOD_READ_TRIGGER_MODE BIT(4) +#define TOD_READ_TRIGGER_SHIFT (0) +#define TOD_READ_TRIGGER_MASK (0xf) + +/* Bit definitions for the DPLL_CTRL_COMBO_MASTER_CFG register */ +#define COMBO_MASTER_HOLD BIT(0) + +/* Bit definitions for DPLL_SYS_STATUS register */ +#define DPLL_SYS_STATE_MASK (0xf) + +/* Bit definitions for SYS_APLL_STATUS register */ +#define SYS_APLL_LOSS_LOCK_LIVE_MASK BIT(0) +#define SYS_APLL_LOSS_LOCK_LIVE_LOCKED 0 +#define SYS_APLL_LOSS_LOCK_LIVE_UNLOCKED 1 + +/* Bit definitions for the DPLL0_STATUS register */ +#define DPLL_STATE_MASK (0xf) +#define DPLL_STATE_SHIFT (0x0) + +/* Values of DPLL_N.DPLL_MODE.PLL_MODE */ +enum pll_mode { + PLL_MODE_MIN = 0, + PLL_MODE_NORMAL = PLL_MODE_MIN, + PLL_MODE_WRITE_PHASE = 1, + PLL_MODE_WRITE_FREQUENCY = 2, + PLL_MODE_GPIO_INC_DEC = 3, + PLL_MODE_SYNTHESIS = 4, + PLL_MODE_PHASE_MEASUREMENT = 5, + PLL_MODE_DISABLED = 6, + PLL_MODE_MAX = PLL_MODE_DISABLED, +}; + +enum hw_tod_write_trig_sel { + HW_TOD_WR_TRIG_SEL_MIN = 0, + HW_TOD_WR_TRIG_SEL_MSB = HW_TOD_WR_TRIG_SEL_MIN, + HW_TOD_WR_TRIG_SEL_RESERVED = 1, + HW_TOD_WR_TRIG_SEL_TOD_PPS = 2, + HW_TOD_WR_TRIG_SEL_IRIGB_PPS = 3, + HW_TOD_WR_TRIG_SEL_PWM_PPS = 4, + HW_TOD_WR_TRIG_SEL_GPIO = 5, + HW_TOD_WR_TRIG_SEL_FOD_SYNC = 6, + WR_TRIG_SEL_MAX = HW_TOD_WR_TRIG_SEL_FOD_SYNC, +}; + +enum scsr_read_trig_sel { + /* CANCEL CURRENT TOD READ; MODULE BECOMES IDLE - NO TRIGGER OCCURS */ + SCSR_TOD_READ_TRIG_SEL_DISABLE = 0, + /* TRIGGER IMMEDIATELY */ + SCSR_TOD_READ_TRIG_SEL_IMMEDIATE = 1, + /* TRIGGER ON RISING EDGE OF INTERNAL TOD PPS SIGNAL */ + SCSR_TOD_READ_TRIG_SEL_TODPPS = 2, + /* TRIGGER ON RISING EDGE OF SELECTED REFERENCE INPUT */ + SCSR_TOD_READ_TRIG_SEL_REFCLK = 3, + /* TRIGGER ON RISING EDGE OF SELECTED PWM DECODER 1PPS OUTPUT */ + SCSR_TOD_READ_TRIG_SEL_PWMPPS = 4, +
SCSR_TOD_READ_TRIG_SEL_RESERVED = 5, + /* TRIGGER WHEN WRITE FREQUENCY EVENT OCCURS */ + SCSR_TOD_READ_TRIG_SEL_WRITEFREQUENCYEVENT = 6, + /* TRIGGER ON SELECTED GPIO */ + SCSR_TOD_READ_TRIG_SEL_GPIO = 7, + SCSR_TOD_READ_TRIG_SEL_MAX = SCSR_TOD_READ_TRIG_SEL_GPIO, +}; + +/* Values STATUS.DPLL_SYS_STATUS.DPLL_SYS_STATE */ +enum dpll_state { + DPLL_STATE_MIN = 0, + DPLL_STATE_FREERUN = DPLL_STATE_MIN, + DPLL_STATE_LOCKACQ = 1, + DPLL_STATE_LOCKREC = 2, + DPLL_STATE_LOCKED = 3, + DPLL_STATE_HOLDOVER = 4, + DPLL_STATE_OPEN_LOOP = 5, + DPLL_STATE_MAX = DPLL_STATE_OPEN_LOOP, +}; + +/* 4.8.7 only */ +enum scsr_tod_write_trig_sel { + SCSR_TOD_WR_TRIG_SEL_DISABLE = 0, + SCSR_TOD_WR_TRIG_SEL_IMMEDIATE = 1, + SCSR_TOD_WR_TRIG_SEL_REFCLK = 2, + SCSR_TOD_WR_TRIG_SEL_PWMPPS = 3, + SCSR_TOD_WR_TRIG_SEL_TODPPS = 4, + SCSR_TOD_WR_TRIG_SEL_SYNCFOD = 5, + SCSR_TOD_WR_TRIG_SEL_GPIO = 6, + SCSR_TOD_WR_TRIG_SEL_MAX = SCSR_TOD_WR_TRIG_SEL_GPIO, +}; + +/* 4.8.7 only */ +enum scsr_tod_write_type_sel { + SCSR_TOD_WR_TYPE_SEL_ABSOLUTE = 0, + SCSR_TOD_WR_TYPE_SEL_DELTA_PLUS = 1, + SCSR_TOD_WR_TYPE_SEL_DELTA_MINUS = 2, + SCSR_TOD_WR_TYPE_SEL_MAX = SCSR_TOD_WR_TYPE_SEL_DELTA_MINUS, +}; +#endif diff --git a/include/linux/mfd/rsmu.h b/include/linux/mfd/rsmu.h new file mode 100644 index 000000000000..6870de608233 --- /dev/null +++ b/include/linux/mfd/rsmu.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Core interface for Renesas Synchronization Management Unit (SMU) devices. + * + * Copyright (C) 2021 Integrated Device Technology, Inc., a Renesas Company. + */ + +#ifndef __LINUX_MFD_RSMU_H +#define __LINUX_MFD_RSMU_H + +/* The supported devices are ClockMatrix, Sabre and SnowLotus */ +enum rsmu_type { + RSMU_CM = 0x34000, + RSMU_SABRE = 0x33810, + RSMU_SL = 0x19850, +}; + +/** + * + * struct rsmu_ddata - device data structure for sub devices. + * + * @dev: i2c/spi device. + * @regmap: i2c/spi bus access. + * @lock: mutex used by sub devices to make sure a series of + * bus access requests are not interrupted. + * @type: RSMU device type. + * @page: i2c/spi bus driver internal use only. + */ +struct rsmu_ddata { + struct device *dev; + struct regmap *regmap; + struct mutex lock; + enum rsmu_type type; + u16 page; +}; +#endif /* __LINUX_MFD_RSMU_H */ -- cgit v1.2.3 From 69031f500865ee3eee19566a1b9c40a189817eaa Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:34 +0800 Subject: swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used Always have the pointer to the swiotlb pool used in struct device. This could help simplify the code for other pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/device.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..2a22875238a6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,6 +423,7 @@ struct dev_links_info { * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations + * @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. 
@@ -531,6 +532,9 @@ struct device { #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ +#endif +#ifdef CONFIG_SWIOTLB + struct io_tlb_mem *dma_io_tlb_mem; #endif /* arch specific additions */ struct dev_archdata archdata; -- cgit v1.2.3 From 7fd856aa7f4261ddac62ea59d8383fef22a0690e Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:35 +0800 Subject: swiotlb: Update is_swiotlb_buffer to add a struct device argument Update is_swiotlb_buffer to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 216854a5e513..d1f3d95881cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -2,6 +2,7 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include #include #include #include @@ -101,9 +102,9 @@ struct io_tlb_mem { }; extern struct io_tlb_mem *io_tlb_default_mem; -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && paddr >= mem->start && paddr < mem->end; } @@ -115,7 +116,7 @@ bool is_swiotlb_active(void); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } -- cgit v1.2.3 From 6f2beb268a5d35504a636c4a3b7aaa76ec32d96c Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:36 +0800 Subject: swiotlb: Update is_swiotlb_active to add a struct device argument Update is_swiotlb_active to add a struct device argument. This will be useful later to allow for different pools. 
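As a minimal, hypothetical sketch of the call-site pattern this enables (the wrapper below is illustrative and not part of this patch; only is_swiotlb_buffer() is real), the check now resolves against the pool attached to the device rather than the global default:

static bool example_needs_bounce(struct device *dev, phys_addr_t paddr)
{
	/* was: is_swiotlb_buffer(paddr), which consulted io_tlb_default_mem */
	return is_swiotlb_buffer(dev, paddr);
}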
Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d1f3d95881cd..dd1c30a83058 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); -bool is_swiotlb_active(void); +bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE @@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev) return SIZE_MAX; } -static inline bool is_swiotlb_active(void) +static inline bool is_swiotlb_active(struct device *dev) { return false; } -- cgit v1.2.3 From 903cd0f315fe426c6a64c54ed389de0becb663dc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:20 +0800 Subject: swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing Propagate the swiotlb_force into io_tlb_default_mem->force_bounce and use it to determine whether to bounce the data or not. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v2: Includes Will's fix] --- include/linux/swiotlb.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index dd1c30a83058..da348671b0d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force; * unmap calls. * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator + * @force_bounce: %true if swiotlb bouncing is forced */ struct io_tlb_mem { phys_addr_t start; @@ -94,6 +95,7 @@ struct io_tlb_mem { spinlock_t lock; struct dentry *debugfs; bool late_alloc; + bool force_bounce; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -109,6 +111,13 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) return mem && paddr >= mem->start && paddr < mem->end; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->force_bounce; +} + void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -120,6 +129,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + return false; +} static inline void swiotlb_exit(void) { } -- cgit v1.2.3 From f4111e39a52aa5d5136d890bbd1aa87c1c8fe3bc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:40 +0800 Subject: swiotlb: Add restricted DMA alloc/free support Add the functions, swiotlb_{alloc,free} and is_swiotlb_for_alloc to support the memory allocation from restricted DMA pool. The restricted DMA pool is preferred if available. 
Note that since coherent allocation needs remapping, one must set up another device coherent pool by shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic coherent allocation. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index da348671b0d5..3b9454d1e498 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -85,6 +85,7 @@ extern enum swiotlb_force swiotlb_force; * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator * @force_bounce: %true if swiotlb bouncing is forced + * @for_alloc: %true if the pool is used for memory allocation */ struct io_tlb_mem { phys_addr_t start; @@ -96,6 +97,7 @@ struct io_tlb_mem { struct dentry *debugfs; bool late_alloc; bool force_bounce; + bool for_alloc; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -158,4 +160,28 @@ static inline void swiotlb_adjust_size(unsigned long size) extern void swiotlb_print_info(void); extern void swiotlb_set_max_segment(unsigned int); +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size); +bool swiotlb_free(struct device *dev, struct page *page, size_t size); + +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return dev->dma_io_tlb_mem->for_alloc; +} +#else +static inline struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + return NULL; +} +static inline bool swiotlb_free(struct device *dev, struct page *page, + size_t size) +{ + return false; +} +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return false; +} +#endif /* CONFIG_DMA_RESTRICTED_POOL */ + #endif /* __LINUX_SWIOTLB_H */ -- cgit v1.2.3 From 0b84e4f8b793eb4045fd64f6f514165a7974cd16 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:41 +0800 Subject: swiotlb: Add restricted DMA pool initialization Add the initialization function to create restricted DMA pools from matching reserved-memory nodes. Regardless of swiotlb setting, the restricted DMA pool is preferred if available. The restricted DMA pools provide a basic level of protection against the DMA overwriting buffer contents at unexpected times. However, to protect against general data leakage and system memory corruption, the system needs to provide a way to lock down the memory access, e.g., MPU. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 3b9454d1e498..39284ff2a6cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force; * range check to see if the memory was in fact allocated by this * API. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and - * @end. This is command line adjustable via setup_io_tlb_npages. + * @end. For default swiotlb, this is command line adjustable via + * setup_io_tlb_npages. * @used: The number of used IO TLB block. * @list: The free list describing the number of free entries available * from each index. 
-- cgit v1.2.3 From fe364a7d95c24e07e9b3f2ab917f01d6d8330bba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Jul 2021 14:39:40 +0300 Subject: dmaengine: dw: Program xBAR hardware for Elkhart Lake The Intel Elkhart Lake PSE DMA implementation is integrated with a crossbar IP in order to serve more hardware than there are DMA request lines available. Due to this, program the xBAR hardware to flexibly support the PSE peripherals. Device-to-Device transfers have not been tested and are not supported by the DMA Engine, but the code is left in place to document the hardware features. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210712113940.42753-1-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b34a094b2258..b11b0c8bc5da 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -52,6 +52,7 @@ struct dw_dma_slave { * @max_burst: Maximum value of burst transaction size supported by hardware * per channel (in units of CTL.SRC_TR_WIDTH/CTL.DST_TR_WIDTH). * @protctl: Protection control signals setting per channel. + * @quirks: Optional platform quirks. */ struct dw_dma_platform_data { unsigned int nr_channels; @@ -71,6 +72,8 @@ struct dw_dma_platform_data { #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) unsigned char protctl; +#define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) + unsigned int quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3 From 7ed012969bbcdbd7aef5778a061681e6cbc4b402 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 14 Jul 2021 17:01:59 +0200 Subject: Compiler Attributes: fix __has_attribute(__no_sanitize_coverage__) for GCC 4 Fix __has_attribute(__no_sanitize_coverage__) for GCC 4 by defining __GCC4_has_attribute___no_sanitize_coverage__. Fixes: 540540d06e9d ("kcov: add __no_sanitize_coverage to fix noinstr for all architectures") Reported-by: Geert Uytterhoeven Signed-off-by: Marco Elver Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 183ddd5fd072..7b1fa5c30169 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -36,6 +36,7 @@ # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___no_sanitize_coverage__ 0 # define __GCC4_has_attribute___fallthrough__ 0 #endif -- cgit v1.2.3 From 5d1ef2ce13a9098b4e0d31c50e4c79763a57b444 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:02 +0200 Subject: ima: Introduce ima_get_current_hash_algo() Buffer measurements, unlike file measurements, are not accessible after the measurement is done, as buffers are not suitable for use with the integrity_iint_cache structure (there is no index; for files it is the inode number). In the subsequent patches, the measurement (digest) will be returned directly by the functions that perform the buffer measurement, ima_measure_critical_data() and process_buffer_measurement(). A caller of those functions also needs to know the algorithm used to calculate the digest.
Instead of adding the algorithm as a new parameter to the functions, this patch provides it separately with the new function ima_get_current_hash_algo(). Since the hash algorithm does not change after the IMA setup phase, there is no risk of races (obtaining a digest calculated with a different algorithm than the one returned). Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian [zohar@linux.ibm.com: annotate ima_hash_algo as __ro_after_init] Signed-off-by: Mimi Zohar --- include/linux/ima.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 61d5723ec303..81e830d01ced 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,9 +11,11 @@ #include #include #include +#include struct linux_binprm; #ifdef CONFIG_IMA +extern enum hash_algo ima_get_current_hash_algo(void); extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask); extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns, @@ -64,6 +66,11 @@ static inline const char * const *arch_get_ima_policy(void) #endif #else +static inline enum hash_algo ima_get_current_hash_algo(void) +{ + return HASH_ALGO__LAST; +} + static inline int ima_bprm_check(struct linux_binprm *bprm) { return 0; -- cgit v1.2.3 From ce5bb5a86e5ebcd3c2e40e6dd1382027b5d43caf Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:03 +0200 Subject: ima: Return int in the functions to measure a buffer ima_measure_critical_data() and process_buffer_measurement() currently don't return a result as, unlike appraisal-related functions, the result is not used by callers to deny an operation. Measurement-related functions instead rely on the audit subsystem to notify the system administrator when an error occurs. However, ima_measure_critical_data() and process_buffer_measurement() are a special case, as these are the only functions that can return a buffer measurement (for files, there is ima_file_hash()). In a subsequent patch, they will be modified to return the calculated digest. In preparation to return the result of the digest calculation, this patch modifies the return type from void to int, and returns 0 if the buffer has been successfully measured, a negative value otherwise. Given that the result of the measurement is still not necessary, this patch does not modify the behavior of existing callers by processing the returned value. For those, the return value is ignored. 
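As a hedged sketch (not from this patch: the label, event name, buf/buf_len and error handling are made up, and the in-tree callers continue to ignore the result), a caller could now check the outcome of a buffer measurement:

	int ret;

	ret = ima_measure_critical_data("example_label", "example_event",
					buf, buf_len, false);
	if (ret < 0)
		pr_debug("buffer measurement failed: %d\n", ret);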
Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian Acked-by: Paul Moore (for the SELinux bits) Signed-off-by: Mimi Zohar --- include/linux/ima.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 81e830d01ced..60492263aa64 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -35,10 +35,10 @@ extern void ima_post_path_mknod(struct user_namespace *mnt_userns, extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); -extern void ima_measure_critical_data(const char *event_label, - const char *event_name, - const void *buf, size_t buf_len, - bool hash); +extern int ima_measure_critical_data(const char *event_label, + const char *event_name, + const void *buf, size_t buf_len, + bool hash); #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM extern void ima_appraise_parse_cmdline(void); @@ -144,10 +144,13 @@ static inline int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {} -static inline void ima_measure_critical_data(const char *event_label, +static inline int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash) {} + bool hash) +{ + return -ENOENT; +} #endif /* CONFIG_IMA */ -- cgit v1.2.3 From ca3c9bdb101d9b9eb3ed8a85cc0fe55915ba49de Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 23 Jul 2021 10:53:04 +0200 Subject: ima: Add digest and digest_len params to the functions to measure a buffer This patch performs the final modification necessary to pass the buffer measurement to callers, so that they provide a functionality similar to ima_file_hash(). It adds the 'digest' and 'digest_len' parameters to ima_measure_critical_data() and process_buffer_measurement(). These functions calculate the digest even if there is no suitable rule in the IMA policy and, in this case, they simply return 1 before generating a new measurement entry. 
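A minimal sketch of the resulting calling convention (illustrative only; the label, event name, buf/buf_len and the 64-byte digest size are assumptions, not from this patch):

	u8 digest[64];	/* assumed large enough for the configured algorithm */
	enum hash_algo algo = ima_get_current_hash_algo();
	int ret;

	ret = ima_measure_critical_data("example_label", "example_event",
					buf, buf_len, false,
					digest, sizeof(digest));
	/*
	 * Per this patch: ret == 0 means measured, ret == 1 means the digest
	 * was calculated but no policy rule matched, ret < 0 means error.
	 * The digest is interpreted with the algorithm reported by
	 * ima_get_current_hash_algo() (here: algo).
	 */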
Signed-off-by: Roberto Sassu Reviewed-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- include/linux/ima.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 60492263aa64..b6ab66a546ae 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -38,7 +38,7 @@ extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); extern int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash); + bool hash, u8 *digest, size_t digest_len); #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM extern void ima_appraise_parse_cmdline(void); @@ -147,7 +147,8 @@ static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) { static inline int ima_measure_critical_data(const char *event_label, const char *event_name, const void *buf, size_t buf_len, - bool hash) + bool hash, u8 *digest, + size_t digest_len) { return -ENOENT; } -- cgit v1.2.3 From 463e862ac63ef27fca423782536f6465abc3f180 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:24 +0100 Subject: swiotlb: Convert io_default_tlb_mem to static allocation Since commit 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used"), 'struct device' may hold a copy of the global 'io_default_tlb_mem' pointer if the device is using swiotlb for DMA. A subsequent call to swiotlb_exit() will therefore leave dangling pointers behind in these device structures, resulting in KASAN splats such as: | BUG: KASAN: use-after-free in __iommu_dma_unmap_swiotlb+0x64/0xb0 | Read of size 8 at addr ffff8881d7830000 by task swapper/0/0 | | CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc3-debug #1 | Hardware name: HP HP Desktop M01-F1xxx/87D6, BIOS F.12 12/17/2020 | Call Trace: | | dump_stack+0x9c/0xcf | print_address_description.constprop.0+0x18/0x130 | kasan_report.cold+0x7f/0x111 | __iommu_dma_unmap_swiotlb+0x64/0xb0 | nvme_pci_complete_rq+0x73/0x130 | blk_complete_reqs+0x6f/0x80 | __do_softirq+0xfc/0x3be Convert 'io_default_tlb_mem' to a static structure, so that the per-device pointers remain valid after swiotlb_exit() has been invoked. All users are updated to reference the static structure directly, using the 'nslabs' field to determine whether swiotlb has been initialised. The 'slots' array is still allocated dynamically and referenced via a pointer rather than a flexible array member. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Fixes: 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 39284ff2a6cd..b0cb2a9973f4 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -103,9 +103,9 @@ struct io_tlb_mem { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; - } slots[]; + } *slots; }; -extern struct io_tlb_mem *io_tlb_default_mem; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { -- cgit v1.2.3 From 374c15594c4ee0dfcceb38852bd43be09070f402 Mon Sep 17 00:00:00 2001 From: "Isaac J. 
Manjarres" Date: Wed, 16 Jun 2021 06:38:42 -0700 Subject: iommu/io-pgtable: Introduce unmap_pages() as a page table op The io-pgtable code expects to operate on a single block or granule of memory that is supported by the IOMMU hardware when unmapping memory. This means that when a large buffer that consists of multiple such blocks is unmapped, the io-pgtable code will walk the page tables to the correct level to unmap each block, even for blocks that are virtually contiguous and at the same level, which can incur an overhead in performance. Introduce the unmap_pages() page table op to express to the io-pgtable code that it should unmap a number of blocks of the same size, instead of a single block. Doing so allows multiple blocks to be unmapped in one call to the io-pgtable code, reducing the number of page table walks, and indirect calls. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-2-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..9391c5fa71e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -144,6 +144,7 @@ struct io_pgtable_cfg { * * @map: Map a physically contiguous memory region. * @unmap: Unmap a physically contiguous memory region. + * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * * These functions map directly onto the iommu_ops member functions with @@ -154,6 +155,9 @@ struct io_pgtable_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); + size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); }; -- cgit v1.2.3 From cacffb7f7b45ba7649eedea4c196c6e9f1863bf3 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:43 -0700 Subject: iommu: Add an unmap_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to unmap a virtually contiguous range of pages of the same size. For IOMMU drivers that do not specify an unmap_pages() callback, the existing logic of unmapping memory one page block at a time will be used. Signed-off-by: Isaac J. 
Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-3-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..25a844121be5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -181,6 +181,7 @@ struct iommu_iotlb_gather { * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain + * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -231,6 +232,9 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); + size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, size_t size); -- cgit v1.2.3 From ca073b55d16a83ba7e73cd313312abc68f07f293 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:44 -0700 Subject: iommu/io-pgtable: Introduce map_pages() as a page table op Mapping memory into io-pgtables follows the same semantics that unmapping memory used to follow (i.e. a buffer will be mapped one page block per call to the io-pgtable code). This means that it can be optimized in the same way that unmapping memory was, so add a map_pages() callback to the io-pgtable ops structure, so that a range of pages of the same size can be mapped within the same call. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-4-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 9391c5fa71e6..c43f3b899d2a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -143,6 +143,7 @@ struct io_pgtable_cfg { * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map: Map a physically contiguous memory region. + * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap: Unmap a physically contiguous memory region. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. 
@@ -153,6 +154,9 @@ struct io_pgtable_cfg { struct io_pgtable_ops { int (*map)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, -- cgit v1.2.3 From 910c4406ccc9613de0a54abf910edc4bf8a575c0 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:45 -0700 Subject: iommu: Add a map_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to map a physically contiguous range of pages of the same size. For IOMMU drivers that do not specify a map_pages() callback, the existing logic of mapping memory one page block at a time will be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-5-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 25a844121be5..d7989d4a7404 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -180,6 +180,8 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain + * @map_pages: map a physically contiguous set of pages of the same size to + * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain @@ -230,6 +232,9 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, -- cgit v1.2.3 From 308723e3580027f0cd7c86a5edfe6b5acb6863d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:20 +0800 Subject: iommu: Remove mode argument from iommu_set_dma_strict() We now only ever set strict mode enabled in iommu_set_dma_strict(), so just remove the argument.
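For callers the conversion is mechanical; an illustrative before/after (not part of this patch):

	/* before */ iommu_set_dma_strict(true);
	/* after */  iommu_set_dma_strict();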
Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-7-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d7989d4a7404..4997c78e2670 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -485,7 +485,7 @@ int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); -void iommu_set_dma_strict(bool val); +void iommu_set_dma_strict(void); bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, -- cgit v1.2.3 From c25abcd625505f53b72dc156bac32b5120826742 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:18 +0530 Subject: kdb: Get rid of redundant kdb_register_flags() Commit e4f291b3f7bb ("kdb: Simplify kdb commands registration") allowed registration of pre-allocated kdb commands with a pointer to struct kdbtab_t. Let's switch the other users as well to register pre-allocated kdb commands via: - Changing the prototype for kdb_register() to pass a pointer to struct kdbtab_t instead. - Embedding the kdbtab_t structure in kdb_macro_t rather than individual params. With these changes kdb_register_flags() becomes redundant and hence is removed. Also, since we have switched all users to register pre-allocated commands, the "is_dynamic" flag in struct kdbtab_t becomes redundant and hence is removed as well. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Acked-by: Steven Rostedt (VMware) Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-3-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 0125a677b67f..de858edfb3b8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -13,6 +13,8 @@ * Copyright (C) 2009 Jason Wessel */ +#include + /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as * go, to have different permissions depending upon whether it is called @@ -64,6 +66,17 @@ typedef enum { typedef int (*kdb_func_t)(int, const char **); +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + struct list_head list_node; /* Command list */ +} kdbtab_t; + #ifdef CONFIG_KGDB_KDB #include #include @@ -193,19 +206,13 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) #endif /* ! CONFIG_KALLSYMS */ /* Dynamic kdb shell command registration */ -extern int kdb_register(char *, kdb_func_t, char *, char *, short); -extern int kdb_register_flags(char *, kdb_func_t, char *, char *, - short, kdb_cmdflags_t); -extern int kdb_unregister(char *); +extern int kdb_register(kdbtab_t *cmd); +extern void kdb_unregister(kdbtab_t *cmd); #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...)
{ return 0; } static inline void kdb_init(int level) {} -static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen) { return 0; } -static inline int kdb_register_flags(char *cmd, kdb_func_t func, char *usage, - char *help, short minlen, - kdb_cmdflags_t flags) { return 0; } -static inline int kdb_unregister(char *cmd) { return 0; } +static inline int kdb_register(kdbtab_t *cmd) { return 0; } +static inline void kdb_unregister(kdbtab_t *cmd) {} #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, -- cgit v1.2.3 From e868f0a3c4b9c1d7721f08b703142a876814a3f8 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:20 +0530 Subject: kdb: Rename members of struct kdbtab_t Remove the redundant prefix "cmd_" from the names of the members in struct kdbtab_t for better readability. Suggested-by: Doug Anderson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-5-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index de858edfb3b8..ea0f5e580fac 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -68,12 +68,12 @@ typedef int (*kdb_func_t)(int, const char **); /* The KDB shell command table */ typedef struct _kdbtab { - char *cmd_name; /* Command name */ - kdb_func_t cmd_func; /* Function to execute command */ - char *cmd_usage; /* Usage String for this command */ - char *cmd_help; /* Help message for this command */ - short cmd_minlen; /* Minimum legal # cmd chars required */ - kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ + char *name; /* Command name */ + kdb_func_t func; /* Function to execute command */ + char *usage; /* Usage String for this command */ + char *help; /* Help message for this command */ + short minlen; /* Minimum legal # cmd chars required */ + kdb_cmdflags_t flags; /* Command behaviour flags */ struct list_head list_node; /* Command list */ } kdbtab_t; -- cgit v1.2.3 From cf0a95659e659d36838e56cc439d3986dcb46870 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 22 Jul 2021 22:34:50 +0300 Subject: clk: x86: Rename clk-lpt to more specific clk-lpss-atom "LPT" stands for the Lynxpoint PCH. However, the driver is used on a few Intel Atom SoCs. Rename it to reflect this, in line with how another clock driver, i.e. clk-pmc-atom, is named. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210722193450.35321-1-andriy.shevchenko@linux.intel.com Acked-by: Rafael J.
Wysocki Signed-off-by: Stephen Boyd --- include/linux/platform_data/x86/clk-lpss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h index 207e1a317800..41df326583f9 100644 --- a/include/linux/platform_data/x86/clk-lpss.h +++ b/include/linux/platform_data/x86/clk-lpss.h @@ -15,6 +15,6 @@ struct lpss_clk_data { struct clk *clk; }; -extern int lpt_clk_init(void); +extern int lpss_atom_clk_init(void); #endif /* __CLK_LPSS_H */ -- cgit v1.2.3 From 147b589c5f446d602215577835356a96c40a4044 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Tue, 6 Jul 2021 22:21:56 +0800 Subject: remoteproc: fix kernel doc for struct rproc_ops The load_rsc_table hook was removed in commit c1d35c1ab424 ("remoteproc: Rename "load_rsc_table" to "parse_fw"") but got added back again by mistake in commit b1a17513a2d6 ("remoteproc: add vendor resources handling"). The patch also fixes a small code indentation issue which is not worth a separate patch. Fixes: b1a17513a2d6 ("remoteproc: add vendor resources handling") Signed-off-by: Dong Aisheng Link: https://lore.kernel.org/r/20210706142156.952794-2-aisheng.dong@nxp.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index a5b37bc10865..83c09ac36b13 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -369,9 +369,8 @@ enum rsc_handling_status { * @da_to_va: optional platform hook to perform address translations * @parse_fw: parse firmware to extract information (e.g. resource table) * @handle_rsc: optional platform hook to handle vendor resources. Should return - * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a - * negative value on error - * @load_rsc_table: load resource table from firmware image + * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled + * and a negative value on error * @find_loaded_rsc_table: find the loaded resource table from firmware image * @get_loaded_rsc_table: get resource table installed in memory * by external entity -- cgit v1.2.3 From fb1ba406c451045f1063ace70086b4645d4e9d54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:18 +0200 Subject: scsi: scsi_ioctl: Remove scsi_cmd_blk_ioctl() Open code scsi_cmd_blk_ioctl() in its two callers. Link: https://lore.kernel.org/r/20210724072033.1284840-10-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3177181c4326..19aa3d5429c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,8 +889,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); -extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, - unsigned int, void __user *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, -- cgit v1.2.3 From 4f07bfc56157ebc689ef54879e90c48a47294083 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:19 +0200 Subject: scsi: scsi_ioctl: Remove scsi_verify_blk_ioctl() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Manually verify that the device is not a partition and the caller has admin privileges at the beginning of the sr ioctl method and open code the trivial check for sd as well. Link: https://lore.kernel.org/r/20210724072033.1284840-11-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 19aa3d5429c0..e2b972a85012 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -888,7 +888,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); -extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, -- cgit v1.2.3 From 547e2f7093b19a993d76c249b4c3ec8af8127d09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:21 +0200 Subject: scsi: block: Add a queue_max_bytes() helper Return the max_sectors value in bytes. Lifted from scsi_ioctl.c. Link: https://lore.kernel.org/r/20210724072033.1284840-13-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e2b972a85012..9971796819ef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1373,6 +1373,11 @@ static inline unsigned int queue_max_sectors(const struct request_queue *q) return q->limits.max_sectors; } +static inline unsigned int queue_max_bytes(struct request_queue *q) +{ + return min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9) << 9; +} + static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; -- cgit v1.2.3 From 78011042684dfbb50f7060f4623793f7a5c74a01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:23 +0200 Subject: scsi: bsg: Move bsg_scsi_ops to drivers/scsi/ Move the SCSI-specific bsg code into the SCSI midlayer instead of leaving it in the common bsg code.
This just keeps the common bsg code in block/ and also allows building it as a module. Link: https://lore.kernel.org/r/20210724072033.1284840-15-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 2 +- include/linux/bsg.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9971796819ef..d36b67bd7267 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -537,7 +537,7 @@ struct request_queue { int mq_freeze_depth; -#if defined(CONFIG_BLK_DEV_BSG) +#if IS_ENABLED(CONFIG_BLK_DEV_BSG_COMMON) struct bsg_class_device bsg_dev; #endif diff --git a/include/linux/bsg.h b/include/linux/bsg.h index dac37b6e00ec..b887da20bd41 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -5,8 +5,9 @@ #include struct request; +struct request_queue; -#ifdef CONFIG_BLK_DEV_BSG +#ifdef CONFIG_BLK_DEV_BSG_COMMON struct bsg_ops { int (*check_proto)(struct sg_io_v4 *hdr); int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, @@ -24,16 +25,10 @@ struct bsg_class_device { int bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, const struct bsg_ops *ops); -int bsg_scsi_register_queue(struct request_queue *q, struct device *parent); void bsg_unregister_queue(struct request_queue *q); #else -static inline int bsg_scsi_register_queue(struct request_queue *q, - struct device *parent) -{ - return 0; -} static inline void bsg_unregister_queue(struct request_queue *q) { } -#endif /* CONFIG_BLK_DEV_BSG */ +#endif /* CONFIG_BLK_DEV_BSG_COMMON */ #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From 7353dc06c9a8e37c80da7ff986e6ef5123bec8ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:26 +0200 Subject: scsi: scsi_ioctl: Simplify SCSI passthrough permission checking Remove the separate command filter structure and just use a switch statement (which also caught two duplicate commands), return a bool and give the function a sensible name. Link: https://lore.kernel.org/r/20210724072033.1284840-18-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d36b67bd7267..e28679e63373 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1343,7 +1343,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t mode); +bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); static inline bool bdev_is_partition(struct block_device *bdev) { return bdev->bd_partno; -- cgit v1.2.3 From f2542a3be3277a65c766fa6e86b930d3d839f79e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 24 Jul 2021 09:20:27 +0200 Subject: scsi: scsi_ioctl: Move the "block layer" SCSI ioctl handling to drivers/scsi Merge the ioctl handling in block/scsi_ioctl.c into its only caller in drivers/scsi/scsi_ioctl.c. Link: https://lore.kernel.org/r/20210724072033.1284840-19-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e28679e63373..8c617a5a5d61 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -28,8 +28,6 @@ #include struct module; -struct scsi_ioctl_command; - struct request_queue; struct elevator_queue; struct blk_trace; @@ -888,13 +886,6 @@ extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); -extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, - unsigned int, void __user *); -extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, - struct scsi_ioctl_command __user *); -extern int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); -extern int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); - extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); @@ -1343,8 +1334,6 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); - static inline bool bdev_is_partition(struct block_device *bdev) { return bdev->bd_partno; -- cgit v1.2.3 From 4e804c39f1be4498d80f379e5b7bc6d4f80f813c Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Wed, 28 Jul 2021 06:12:51 +0200 Subject: gpiolib: convert 'devprop_gpiochip_set_names' to support multiple gpiochip banks per device The default gpiolib-of implementation does not work with the multiple-gpiochip-banks-per-device structure used, for example, by the gpio-mt7621 and gpio-brcmstb drivers. To fix these kinds of situations, driver code has been forced to fill in the names itself to keep the gpiolib code from assigning duplicate names across the banks. Instead of continuing with that antipattern, fix the gpiolib core function so it behaves as expected in every situation, by adding an 'offset' field to the gpiochip structure. This way, we can assume the offset will be zero for normal driver code, where only one gpiochip bank per device is used, but it can be set explicitly in those drivers that really need more than one gpiochip. Reviewed-by: Andy Shevchenko Reviewed-by: Gregory Fong Signed-off-by: Sergio Paracuellos Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 3a268781fcec..a0f9901dcae6 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -312,6 +312,9 @@ struct gpio_irq_chip { * get rid of the static GPIO number space in the long run. * @ngpio: the number of GPIOs handled by this controller; the last GPIO * handled is (base + ngpio - 1). + * @offset: when multiple gpio chips belong to the same device this + * can be used as offset within the device so friendly names can + * be properly assigned. * @names: if set, must be an array of strings to use as alternative * names for the GPIOs in this chip.
Any entry in the array * may be NULL if there is no alias for the GPIO, however the @@ -398,6 +401,7 @@ struct gpio_chip { int base; u16 ngpio; + u16 offset; const char *const *names; bool can_sleep; -- cgit v1.2.3 From ba51bdafaafc065019c6f6a2cdae006d176cee48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Jul 2021 15:02:43 +0200 Subject: scsi: sr: cdrom: Move cdrom_read_cdda_bpc() into the sr driver cdrom_read_cdda_bpc() relies on sending a SCSI command to the low level driver using a REQ_OP_SCSI_IN request. This isn't generic block layer functionality, so move the actual low-level code into the sr driver and call it through a new read_cdda_bpc method in the cdrom_device_ops structure. With this the CDROM code does not have to pull in scsi_normalize_sense() and depend on CONFIG_SCSI_COMMON. Link: https://lore.kernel.org/r/20210730072752.GB23847%40lst.de Tested-by: Anders Roxell Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/cdrom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index f48d0a31deae..c4fef00abdf3 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -86,11 +86,13 @@ struct cdrom_device_ops { /* play stuff */ int (*audio_ioctl) (struct cdrom_device_info *,unsigned int, void *); -/* driver specifications */ - const int capability; /* capability flags */ /* handle uniform packets for scsi type devices (scsi,atapi) */ int (*generic_packet) (struct cdrom_device_info *, struct packet_command *); + int (*read_cdda_bpc)(struct cdrom_device_info *cdi, void __user *ubuf, + u32 lba, u32 nframes, u8 *last_sense); +/* driver specifications */ + const int capability; /* capability flags */ }; int cdrom_multisession(struct cdrom_device_info *cdi, -- cgit v1.2.3 From ead09dd3aed5cc6a6c6288a87a5bfa9bbc8d5ecf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:42 +0200 Subject: scsi: bsg: Simplify device registration Use the per-device cdev_device_interface to store the bsg data in the char device inode, and thus remove the need to embed the bsg_class_device structure in the request_queue. Link: https://lore.kernel.org/r/20210729064845.1044147-2-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K.
Petersen --- include/linux/blkdev.h | 6 ------ include/linux/bsg-lib.h | 1 + include/linux/bsg.h | 21 ++++++--------------- 3 files changed, 7 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8c617a5a5d61..28957ccdd9c2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -33,7 +32,6 @@ struct elevator_queue; struct blk_trace; struct request; struct sg_io_hdr; -struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; @@ -535,10 +533,6 @@ struct request_queue { int mq_freeze_depth; -#if IS_ENABLED(CONFIG_BLK_DEV_BSG_COMMON) - struct bsg_class_device bsg_dev; -#endif - #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index 960988d42f77..6b211323a489 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -12,6 +12,7 @@ #include #include +struct bsg_job; struct request; struct device; struct scatterlist; diff --git a/include/linux/bsg.h b/include/linux/bsg.h index b887da20bd41..fa21f79beda2 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -4,10 +4,11 @@ #include +struct bsg_device; +struct device; struct request; struct request_queue; -#ifdef CONFIG_BLK_DEV_BSG_COMMON struct bsg_ops { int (*check_proto)(struct sg_io_v4 *hdr); int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, @@ -16,19 +17,9 @@ struct bsg_ops { void (*free_rq)(struct request *rq); }; -struct bsg_class_device { - struct device *class_dev; - int minor; - struct request_queue *queue; - const struct bsg_ops *ops; -}; +struct bsg_device *bsg_register_queue(struct request_queue *q, + struct device *parent, const char *name, + const struct bsg_ops *ops); +void bsg_unregister_queue(struct bsg_device *bcd); -int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops); -void bsg_unregister_queue(struct request_queue *q); -#else -static inline void bsg_unregister_queue(struct request_queue *q) -{ -} -#endif /* CONFIG_BLK_DEV_BSG_COMMON */ #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From cf93a27446fe1a6e0acb9bbedf5fce1e98e4fc5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:43 +0200 Subject: scsi: block: Remove BLK_SCSI_MAX_CMDS This was used for the table-based SCSI passthrough permission checking that is gone now. Link: https://lore.kernel.org/r/20210729064845.1044147-3-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28957ccdd9c2..e0bb14acb708 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -271,9 +271,6 @@ enum blk_queue_state { #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ -#define BLK_SCSI_MAX_CMDS (256) -#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) - /* * Zoned block device models (zoned limit).
* -- cgit v1.2.3 From 1e61c1a804d2a2a3c46add01cac3a6e9eca01080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:44 +0200 Subject: scsi: block: Remove the remaining SG_IO-related fields from struct request_queue Move the sg_timeout and sg_reserved_size fields into the bsg_device and scsi_device structures as they have nothing to do with generic block I/O. Note that these values are now separate for bsg vs. SCSI device node access, but that just matches how /dev/sg vs the other nodes has always behaved. Link: https://lore.kernel.org/r/20210729064845.1044147-4-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e0bb14acb708..987f15089eeb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -499,11 +499,6 @@ struct request_queue { unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ - /* - * sg stuff - */ - unsigned int sg_timeout; - unsigned int sg_reserved_size; int node; struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE -- cgit v1.2.3 From 75ca56409e5b35aa6ceef94462f39ef4f533fc41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Jul 2021 08:48:45 +0200 Subject: scsi: bsg: Move the whole request execution into the SCSI/transport handlers Reduce the number of indirect calls by making the handler responsible for the entire execution of the request. Link: https://lore.kernel.org/r/20210729064845.1044147-5-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/bsg.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bsg.h b/include/linux/bsg.h index fa21f79beda2..1ac81c809da9 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -6,20 +6,14 @@ struct bsg_device; struct device; -struct request; struct request_queue; -struct bsg_ops { - int (*check_proto)(struct sg_io_v4 *hdr); - int (*fill_hdr)(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode); - int (*complete_rq)(struct request *rq, struct sg_io_v4 *hdr); - void (*free_rq)(struct request *rq); -}; +typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, + fmode_t mode, unsigned int timeout); struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, - const struct bsg_ops *ops); + bsg_sg_io_fn *sg_io_fn); void bsg_unregister_queue(struct bsg_device *bcd); #endif /* _LINUX_BSG_H */ -- cgit v1.2.3 From 3136895cc5b665c1ab406d78f90c0700a3551e74 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 23 Jul 2021 02:32:05 -0700 Subject: iommu: Improve iommu_iotlb_gather helpers The Mediatek driver is not the only one which might want a basic address-based gathering behaviour, so although it's arguably simple enough to open-code, let's factor it out for the sake of cleanliness. Let's also take this opportunity to document the intent of these helpers for clarity.
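To make the intent concrete, here is a hedged sketch (my_pgtable_unmap() is hypothetical, not part of the patch) of the address-only pattern that iommu_iotlb_gather_add_range() factors out: a driver's unmap path merely widens the pending invalidation range and leaves the flush to the eventual iotlb_sync.

static size_t my_unmap(struct iommu_domain *domain, unsigned long iova,
		       size_t size, struct iommu_iotlb_gather *gather)
{
	size_t unmapped = my_pgtable_unmap(domain, iova, size);

	/* Only the address range matters; no page-size bookkeeping needed. */
	if (unmapped)
		iommu_iotlb_gather_add_range(gather, iova, unmapped);
	return unmapped;
}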
Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Robin Murphy Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-4-namit@vmware.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..e554871db46f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,38 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build arbitrarily-sized invalidation commands + * where only the address range matters, and simply minimising intermediate + * syncs is preferred. + */ +static inline void iommu_iotlb_gather_add_range(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long end = iova + size - 1; + + if (gather->start > iova) + gather->start = iova; + if (gather->end < end) + gather->end = end; +} + +/** + * iommu_iotlb_gather_add_page - Gather for page-based TLB invalidation + * @domain: IOMMU domain to be invalidated + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build invalidation commands based on individual + * pages, or with page size/table level hints which cannot be gathered if they + * differ. + */ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) @@ -515,11 +547,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->pgsize = size; } - if (gather->end < end) - gather->end = end; - - if (gather->start > start) - gather->start = start; + iommu_iotlb_gather_add_range(gather, iova, size); } /* PCI device grouping function */ -- cgit v1.2.3 From febb82c208e481eee057c70fa3176bb48712a111 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:06 -0700 Subject: iommu: Factor iommu_iotlb_gather_is_disjoint() out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor iommu_iotlb_gather_add_page() and factor out the logic that detects whether the IOTLB gather range and a new range are disjoint. To be used by the next patch that implements different gathering logic for AMD. Note that updating gather->pgsize unconditionally does not affect correctness as the function had (and has) an invariant, in which gather->pgsize always represents the flushing granularity of its range. Arguably, "size" should never be zero, but let's assume for the sake of discussion that it might. If "size" equals "gather->pgsize", then the assignment in question has no impact. Otherwise, if "size" is non-zero, then iommu_iotlb_sync() would initialize the size and range (see iommu_iotlb_gather_init()), and the invariant is kept. Otherwise, "size" is zero, and "gather" already holds a range, so gather->pgsize is non-zero and (gather->pgsize && gather->pgsize != size) is true. Therefore, again, iommu_iotlb_sync() would be called and initialize the size.
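A worked example of the disjoint check, with made-up numbers: suppose the gather currently covers [0x1000, 0x2fff]. Adding iova = 0x3000 with size = 0x1000 is not disjoint, because start (0x3000) is not greater than gather->end + 1 (0x3000); the range is merely adjacent and simply grows to [0x1000, 0x3fff]. Adding iova = 0x5000 instead does satisfy start > gather->end + 1, so the helper reports the ranges as disjoint and the caller syncs before recording the new range.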
Cc: Joerg Roedel Cc: Jiajun Cao Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org> Reviewed-by: Robin Murphy Acked-by: Will Deacon Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-5-namit@vmware.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e554871db46f..979a5ceeea55 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,28 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_is_disjoint - Checks whether a new range is disjoint + * + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to check whether a new range and the gathered range + * are disjoint. For many IOMMUs, flushing the IOMMU in this case is better + * than merging the two, which might lead to unnecessary invalidations. + */ +static inline +bool iommu_iotlb_gather_is_disjoint(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long start = iova, end = start + size - 1; + + return gather->end != 0 && + (end + 1 < gather->start || start > gather->end + 1); +} + + /** * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation * @gather: TLB gather data @@ -533,20 +555,16 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) { - unsigned long start = iova, end = start + size - 1; - /* * If the new page is disjoint from the current range or is mapped at * a different granularity, then sync the TLB so that the gather * structure can be rewritten. */ - if (gather->pgsize != size || - end + 1 < gather->start || start > gather->end + 1) { - if (gather->pgsize) - iommu_iotlb_sync(domain, gather); - gather->pgsize = size; - } + if ((gather->pgsize && gather->pgsize != size) || + iommu_iotlb_gather_is_disjoint(gather, iova, size)) + iommu_iotlb_sync(domain, gather); + gather->pgsize = size; iommu_iotlb_gather_add_range(gather, iova, size); } -- cgit v1.2.3 From 205d76ff0684a0b4fe3ff3a283d143a47439d191 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:50 +0100 Subject: KVM: Remove kvm_is_transparent_hugepage() and PageTransCompoundMap() Now that arm64 has stopped using kvm_is_transparent_hugepage(), we can remove it, as well as PageTransCompoundMap() which was only used by the former. Acked-by: Paolo Bonzini Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210726153552.1535838-5-maz@kernel.org --- include/linux/page-flags.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..1ace27c4a8e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -632,43 +632,6 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } -/* - * PageTransCompoundMap is the same as PageTransCompound, but it also - * guarantees the primary MMU has the entire compound page mapped - * through pmd_trans_huge, which in turn guarantees the secondary MMUs - * can also map the entire compound page. 
This allows the secondary - * MMUs to call get_user_pages() only once for each compound page and - * to immediately map the entire compound page with a single secondary - * MMU fault. If there will be a pmd split later, the secondary MMUs - * will get an update through the MMU notifier invalidation through - * split_huge_pmd(). - * - * Unlike PageTransCompound, this is safe to be called only while - * split_huge_pmd() cannot run from under us, like if protected by the - * MMU notifier, otherwise it may result in page->_mapcount check false - * positives. - * - * We have to treat page cache THP differently since every subpage of it - * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE - * mapped in the current process so comparing subpage's _mapcount to - * compound_mapcount to filter out PTE mapped case. - */ -static inline int PageTransCompoundMap(struct page *page) -{ - struct page *head; - - if (!PageTransCompound(page)) - return 0; - - if (PageAnon(page)) - return atomic_read(&page->_mapcount) < 0; - - head = compound_head(page); - /* File THP is PMD mapped and not PTE mapped */ - return atomic_read(&page->_mapcount) == - atomic_read(compound_mapcount_ptr(head)); -} - /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known -- cgit v1.2.3 From 36c3ce6c0d03a6c9992c3359f879cdc70fde836a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 26 Jul 2021 16:35:52 +0100 Subject: KVM: Get rid of kvm_get_pfn() Nobody is using kvm_get_pfn() anymore. Get rid of it. Acked-by: Paolo Bonzini Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210726153552.1535838-7-maz@kernel.org --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..9818d271c2a1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -824,7 +824,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_get_pfn(kvm_pfn_t pfn); void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, -- cgit v1.2.3 From 0b8f11737cffc1a406d1134b58687abc29d76b52 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:23 -0700 Subject: KVM: Add infrastructure and macro to mark VM as bugged Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Reviewed-by: Paolo Bonzini Message-Id: <3a0998645c328bf0895f1290e61821b70f048549.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..5342592841be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 +#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -596,6 +597,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool vm_bugged; #ifdef 
CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -629,6 +631,31 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +static inline void kvm_vm_bugged(struct kvm *kvm) +{ + kvm->vm_bugged = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); +} + +#define KVM_BUG(cond, kvm, fmt...) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + +#define KVM_BUG_ON(cond, kvm) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); @@ -946,7 +973,6 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except); bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, -- cgit v1.2.3 From 7ee3e8c39d3aed6ff4cc618d86ba9128f0c80087 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jul 2021 15:04:24 -0700 Subject: KVM: Export kvm_make_all_cpus_request() for use in marking VMs as bugged Export kvm_make_all_cpus_request() and hoist the request helper declarations of request up to the KVM_REQ_* definitions in preparation for adding a "VM bugged" framework. The framework will add KVM_BUG() and KVM_BUG_ON() as alternatives to full BUG()/BUG_ON() for cases where KVM has definitely hit a bug (in itself or in silicon) and the VM is all but guaranteed to be hosed. Marking a VM bugged will trigger a request to all vCPUs to allow arch code to forcefully evict each vCPU from its run loop. Signed-off-by: Sean Christopherson Signed-off-by: Isaku Yamahata Message-Id: <1d8cbbc8065d831343e70b5dcaea92268145eef1.1625186503.git.isaku.yamahata@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5342592841be..b7bf9d6a7780 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -159,6 +159,15 @@ static inline bool is_error_page(struct page *page) }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) +bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, + unsigned long *vcpu_bitmap, cpumask_var_t tmp); +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); + #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -631,7 +640,6 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) 
\ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; @@ -970,14 +978,6 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except); -bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap); - long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, -- cgit v1.2.3 From 605c713023e3925d0444f495a42c903cb6ce875f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Jun 2021 11:32:07 -0400 Subject: KVM: Introduce kvm_get_kvm_safe() Introduce this safe version of kvm_get_kvm() so that it can be called even during vm destruction. Use it in kvm_debugfs_open() and remove the verbose comment. Prepare to be used elsewhere. Signed-off-by: Peter Xu Message-Id: <20210625153214.43106-3-peterx@redhat.com> [Preserve the comment in kvm_debugfs_open. - Paolo] Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b7bf9d6a7780..de58a0890b1a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -755,6 +755,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); +bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); -- cgit v1.2.3 From 52ac8b358b0cb7e91c966225fca61be5d1c984bc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 May 2021 08:09:15 -0400 Subject: KVM: Block memslot updates across range_start() and range_end() We would like to avoid taking mmu_lock for .invalidate_range_{start,end}() notifications that are unrelated to KVM. Because mmu_notifier_count must be modified while holding mmu_lock for write, and must always be paired across start->end to stay balanced, lock elision must happen in both or none. Therefore, in preparation for this change, this patch prevents memslot updates across range_start() and range_end(). Note, technically flag-only memslot updates could be allowed in parallel, but stalling a memslot update for a relatively short amount of time is not a scalability issue, and this is all more than complex enough. A long note on the locking: a previous version of the patch used an rwsem to block the memslot update while the MMU notifier run, but this resulted in the following deadlock involving the pseudo-lock tagged as "mmu_notifier_invalidate_range_start". 
======================================================
WARNING: possible circular locking dependency detected
5.12.0-rc3+ #6 Tainted: G OE
------------------------------------------------------
qemu-system-x86/3069 is trying to acquire lock:
ffffffff9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190

but task is already holding lock:
ffffaff7410a9160 (&kvm->mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm]

which lock already depends on the new lock.

This corresponds to the following MMU notifier logic:

    invalidate_range_start
      take pseudo lock
      down_read()           (*)
      release pseudo lock
    invalidate_range_end
      take pseudo lock      (**)
      up_read()
      release pseudo lock

At point (*) we take the mmu_notifiers_slots_lock inside the pseudo lock; at point (**) we take the pseudo lock inside the mmu_notifiers_slots_lock. This could cause a deadlock (ignoring for a second that the pseudo lock is not a lock):

- invalidate_range_start waits on down_read(), because the rwsem is held by install_new_memslots
- install_new_memslots waits on down_write(), because the rwsem is held till (another) invalidate_range_end finishes
- invalidate_range_end waits on the pseudo lock, held by invalidate_range_start.

Removing the fairness of the rwsem breaks the cycle (in lockdep terms, it would change the *shared* rwsem readers into *shared recursive* readers), so open-code the wait using a readers count and a spinlock. This also allows handling blockable and non-blockable critical sections in the same way. Losing the rwsem fairness does theoretically allow MMU notifiers to block install_new_memslots forever. Note that mm/mmu_notifier.c's own retry scheme in mmu_interval_read_begin also uses wait/wake_up and is likewise not fair. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index de58a0890b1a..5b6a69caccb5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -548,6 +548,11 @@ struct kvm { struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* Used to wait for completion of MMU notifiers. */ + spinlock_t mn_invalidate_lock; + unsigned long mn_active_invalidate_count; + struct rcuwait mn_memslots_update_rcuwait; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only -- cgit v1.2.3 From d08c8b855140e9f5240b3ffd1b8b9d435675e281 Mon Sep 17 00:00:00 2001 From: Wasim Khan Date: Thu, 29 Jul 2021 14:17:47 +0200 Subject: PCI: Add ACS quirks for NXP LX2xx0 and LX2xx2 platforms Root Ports in NXP LX2xx0 and LX2xx2, where each Root Port is a Root Complex with unique segment numbers, do provide isolation features to disable peer transactions and validate bus numbers in requests, but do not provide an actual PCIe ACS capability. Add ACS quirks for NXP LX2xx0 A/C/E/N and LX2xx2 A/C/E/N platforms.
LX2xx0A : without security features + CAN-FD
  LX2160A (0x8d81) - 16 cores
  LX2120A (0x8da1) - 12 cores
  LX2080A (0x8d83) - 8 cores
LX2xx0C : security features + CAN-FD
  LX2160C (0x8d80) - 16 cores
  LX2120C (0x8da0) - 12 cores
  LX2080C (0x8d82) - 8 cores
LX2xx0E : security features + CAN
  LX2160E (0x8d90) - 16 cores
  LX2120E (0x8db0) - 12 cores
  LX2080E (0x8d92) - 8 cores
LX2xx0N : without security features + CAN
  LX2160N (0x8d91) - 16 cores
  LX2120N (0x8db1) - 12 cores
  LX2080N (0x8d93) - 8 cores
LX2xx2A : without security features + CAN-FD
  LX2162A (0x8d89) - 16 cores
  LX2122A (0x8da9) - 12 cores
  LX2082A (0x8d8b) - 8 cores
LX2xx2C : security features + CAN-FD
  LX2162C (0x8d88) - 16 cores
  LX2122C (0x8da8) - 12 cores
  LX2082C (0x8d8a) - 8 cores
LX2xx2E : security features + CAN
  LX2162E (0x8d98) - 16 cores
  LX2122E (0x8db8) - 12 cores
  LX2082E (0x8d9a) - 8 cores
LX2xx2N : without security features + CAN
  LX2162N (0x8d99) - 16 cores
  LX2122N (0x8db9) - 12 cores
  LX2082N (0x8d9b) - 8 cores
[bhelgaas: put PCI_VENDOR_ID_NXP definition next to PCI_VENDOR_ID_FREESCALE as a clue that they share the same Device ID namespace] Link: https://lore.kernel.org/r/20210729121747.1823086-1-wasim.khan@oss.nxp.com Link: https://lore.kernel.org/r/20210803180021.3252886-1-wasim.khan@oss.nxp.com Signed-off-by: Wasim Khan Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..1a9b8589391c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2451,7 +2451,8 @@ #define PCI_VENDOR_ID_TDI 0x192E #define PCI_DEVICE_ID_TDI_EHCI 0x0101 -#define PCI_VENDOR_ID_FREESCALE 0x1957 +#define PCI_VENDOR_ID_FREESCALE 0x1957 /* duplicate: NXP */ +#define PCI_VENDOR_ID_NXP 0x1957 /* duplicate: FREESCALE */ #define PCI_DEVICE_ID_MPC8308 0xc006 #define PCI_DEVICE_ID_MPC8315E 0x00b4 #define PCI_DEVICE_ID_MPC8315 0x00b5 -- cgit v1.2.3 From a065d5615fc83908ef21ed8159ffb63d816ff5de Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 28 Jul 2021 16:42:27 +0200 Subject: of: unify of_count_phandle_with_args() arguments with !CONFIG_OF Unify the declaration of of_count_phandle_with_args() between enabled and disabled OF by constifying the pointed-to device_node. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 9c2e71e202d1..dfeb065c3fad 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -896,7 +896,7 @@ static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, return -ENOSYS; } -static inline int of_count_phandle_with_args(struct device_node *np, +static inline int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) { -- cgit v1.2.3 From e6ae9a833ef4043b940954b8dcac31493706b9d6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 28 Jul 2021 16:42:28 +0200 Subject: gpiolib: constify passed device_node pointer Several gpiolib functions receive a pointer to struct device_node which is later passed to OF functions. These OF functions already accept a pointer to const, so gpiolib can follow a similar approach to indicate they are not modifying the struct device_node.
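A minimal sketch of what this constification buys callers (the function here is hypothetical): code that only inspects a device tree node can take a const pointer and still use the GPIO helpers without casting.

static int my_count_gpios(const struct device_node *np)
{
	/* Compiles cleanly once the helpers accept a const node. */
	return of_gpio_named_count(np, "gpios");
}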
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 8 ++++---- include/linux/of_gpio.h | 15 ++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 566feb56601f..bf945b776555 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -609,7 +609,7 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) struct device_node; -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, +struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label); @@ -619,7 +619,7 @@ struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, struct device_node; static inline -struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, +struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label) @@ -633,7 +633,7 @@ struct gpio_desc *gpiod_get_from_of_node(struct device_node *node, struct device_node; struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, + const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label); @@ -644,7 +644,7 @@ struct device_node; static inline struct gpio_desc *devm_gpiod_get_from_of_node(struct device *dev, - struct device_node *node, + const struct device_node *node, const char *propname, int index, enum gpiod_flags dflags, const char *label) diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index f821095218b0..8bf2ea859653 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -49,7 +49,7 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) return container_of(gc, struct of_mm_gpio_chip, gc); } -extern int of_get_named_gpio_flags(struct device_node *np, +extern int of_get_named_gpio_flags(const struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags); extern int of_mm_gpiochip_add_data(struct device_node *np, @@ -67,7 +67,7 @@ extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); #include /* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_named_gpio_flags(struct device_node *np, +static inline int of_get_named_gpio_flags(const struct device_node *np, const char *list_name, int index, enum of_gpio_flags *flags) { if (flags) @@ -98,7 +98,8 @@ static inline int of_get_named_gpio_flags(struct device_node *np, * The above example defines four GPIOs, two of which are not specified. 
* This function will return '4' */ -static inline int of_gpio_named_count(struct device_node *np, const char* propname) +static inline int of_gpio_named_count(const struct device_node *np, + const char *propname) { return of_count_phandle_with_args(np, propname, "#gpio-cells"); } @@ -109,12 +110,12 @@ static inline int of_gpio_named_count(struct device_node *np, const char* propna * * Same as of_gpio_named_count, but hard coded to use the 'gpios' property */ -static inline int of_gpio_count(struct device_node *np) +static inline int of_gpio_count(const struct device_node *np) { return of_gpio_named_count(np, "gpios"); } -static inline int of_get_gpio_flags(struct device_node *np, int index, +static inline int of_get_gpio_flags(const struct device_node *np, int index, enum of_gpio_flags *flags) { return of_get_named_gpio_flags(np, "gpios", index, flags); @@ -129,7 +130,7 @@ static inline int of_get_gpio_flags(struct device_node *np, int index, * Returns GPIO number to use with Linux generic GPIO API, or one of the errno * value on the error condition. */ -static inline int of_get_named_gpio(struct device_node *np, +static inline int of_get_named_gpio(const struct device_node *np, const char *propname, int index) { return of_get_named_gpio_flags(np, propname, index, NULL); @@ -143,7 +144,7 @@ static inline int of_get_named_gpio(struct device_node *np, * Returns GPIO number to use with Linux generic GPIO API, or one of the errno * value on the error condition. */ -static inline int of_get_gpio(struct device_node *np, int index) +static inline int of_get_gpio(const struct device_node *np, int index) { return of_get_gpio_flags(np, index, NULL); } -- cgit v1.2.3 From edfa378448b566f705f5e81fd1565dc39ef6b716 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 12:17:36 +0200 Subject: clk: Align provider-specific CLK_* bit definitions The definition of CLK_MULTIPLIER_ROUND_CLOSEST is not aligned to the two bit definitions next to it. A deeper inspection reveals that the alignment of CLK_MULTIPLIER_ROUND_CLOSEST does match the most common alignment. Align the bit definitions for the various provider types throughout the file at 40 columns, to increase uniformity. 
Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/5468cd9e50cda8fc59cb6baab9413c6c0de1a974.1626257689.git.geert+renesas@glider.be Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d83b829305c0..7be81d5fcf8c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -342,7 +342,7 @@ struct clk_fixed_rate { unsigned long flags; }; -#define CLK_FIXED_RATE_PARENT_ACCURACY BIT(0) +#define CLK_FIXED_RATE_PARENT_ACCURACY BIT(0) extern const struct clk_ops clk_fixed_rate_ops; struct clk_hw *__clk_hw_register_fixed_rate(struct device *dev, @@ -1020,8 +1020,8 @@ struct clk_fractional_divider { #define to_clk_fd(_hw) container_of(_hw, struct clk_fractional_divider, hw) -#define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) -#define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) +#define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) +#define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) extern const struct clk_ops clk_fractional_divider_ops; struct clk *clk_register_fractional_divider(struct device *dev, @@ -1069,9 +1069,9 @@ struct clk_multiplier { #define to_clk_multiplier(_hw) container_of(_hw, struct clk_multiplier, hw) -#define CLK_MULTIPLIER_ZERO_BYPASS BIT(0) +#define CLK_MULTIPLIER_ZERO_BYPASS BIT(0) #define CLK_MULTIPLIER_ROUND_CLOSEST BIT(1) -#define CLK_MULTIPLIER_BIG_ENDIAN BIT(2) +#define CLK_MULTIPLIER_BIG_ENDIAN BIT(2) extern const struct clk_ops clk_multiplier_ops; -- cgit v1.2.3 From edeb2ca74716056b2be62925f21494d7af65150f Mon Sep 17 00:00:00 2001 From: Martin Botka Date: Fri, 30 Jul 2021 23:59:24 +0200 Subject: clk: qcom: smd: Add support for SM6125 rpm clocks Add rpm smd clocks, PMIC and bus clocks which are required on SM6125 for clients to vote on. Signed-off-by: Martin Botka Link: https://lore.kernel.org/r/20210730215924.733350-2-martin.botka@somainline.org Signed-off-by: Stephen Boyd --- include/linux/soc/qcom/smd-rpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index f2645ec52520..60e66fc9b6bf 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -29,6 +29,7 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_NCPB 0x6270636E #define QCOM_SMD_RPM_OCMEM_PWR 0x706d636f #define QCOM_SMD_RPM_QPIC_CLK 0x63697071 +#define QCOM_SMD_RPM_QUP_CLK 0x707571 #define QCOM_SMD_RPM_SMPA 0x61706d73 #define QCOM_SMD_RPM_SMPB 0x62706d73 #define QCOM_SMD_RPM_SPDM 0x63707362 -- cgit v1.2.3 From 87689270b10fa9e6fac7242233b355cb6792b845 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:38 +0000 Subject: KVM: Rename lru_slot to last_used_slot lru_slot is used to keep track of the index of the most-recently used memslot. The correct acronym would be "mru" but that is not a common acronym. So call it last_used_slot which is a bit more obvious. 
Suggested-by: Paolo Bonzini Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5b6a69caccb5..bdfd5ed539c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -522,7 +522,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -1200,7 +1200,7 @@ static inline struct kvm_memory_slot * search_memslots(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (unlikely(!slots->used_slots)) @@ -1221,7 +1221,7 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) if (start < slots->used_slots && gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); return &memslots[start]; } -- cgit v1.2.3 From 0f22af940dc8ec4f437189096a5f8677995323b0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:39 +0000 Subject: KVM: Move last_used_slot logic out of search_memslots Make search_memslots unconditionally search all memslots and move the last_used_slot logic up one level to __gfn_to_memslot. This is in preparation for introducing a per-vCPU last_used_slot. As part of this change convert existing callers of search_memslots to __gfn_to_memslot to avoid making any functional changes. Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 64 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bdfd5ed539c9..f30b53a07917 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1189,29 +1189,43 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! 
*/ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1219,19 +1233,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->last_used_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long -- cgit v1.2.3 From fe22ed827c5b60b895b15c5c3f04e04ac606be38 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 4 Aug 2021 22:28:40 +0000 Subject: KVM: Cache the last used slot index per vCPU The memslot for a given gfn is looked up multiple times during page fault handling. Avoid binary searching for it multiple times by caching the most recently used slot. There is an existing VM-wide last_used_slot but that does not work well for cases where vCPUs are accessing memory in different slots (see performance data below). Another benefit of caching the most recently used slot (versus looking up the slot once and passing around a pointer) is speeding up memslot lookups *across* faults and during spte prefetching. To measure the performance of this change I ran dirty_log_perf_test with 64 vCPUs and 64 memslots and measured "Populate memory time" and "Iteration 2 dirty memory time". Tests were run with eptad=N to force dirty logging to use fast_page_fault so its performance could be measured.

Config     | Metric                        | Before | After
---------- | ----------------------------- | ------ | ------
tdp_mmu=Y  | Populate memory time          | 6.76s  | 5.47s
tdp_mmu=Y  | Iteration 2 dirty memory time | 2.83s  | 0.31s
tdp_mmu=N  | Populate memory time          | 20.4s  | 18.7s
tdp_mmu=N  | Iteration 2 dirty memory time | 2.65s  | 0.30s

The "Iteration 2 dirty memory time" results are especially compelling because they are equivalent to running the same test with a single memslot. In other words, fast_page_fault performance no longer scales with the number of memslots.
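To make the mechanism concrete, here is a minimal, self-contained user-space sketch of the same pattern: consult a cached index first, fall back to binary search, and refresh the cache on a hit. The slot layout and the ascending sort order are deliberate simplifications, not the kernel code; see the try_get_memslot()/__gfn_to_memslot() hunks above for the real version.

#include <stdio.h>

struct slot { unsigned long base, npages; };

static struct slot slots[] = {
	{ .base = 0x0000, .npages = 0x1000 },
	{ .base = 0x2000, .npages = 0x1000 },
	{ .base = 0x8000, .npages = 0x4000 },
};
static int last_used;	/* analogue of vcpu->last_used_slot */

static int slot_contains(int i, unsigned long gfn)
{
	return gfn >= slots[i].base && gfn < slots[i].base + slots[i].npages;
}

static struct slot *lookup(unsigned long gfn)
{
	int lo = 0, hi = sizeof(slots) / sizeof(slots[0]) - 1;

	if (slot_contains(last_used, gfn))	/* fast path: cached index */
		return &slots[last_used];

	while (lo <= hi) {			/* slow path: binary search */
		int mid = lo + (hi - lo) / 2;

		if (slot_contains(mid, gfn)) {
			last_used = mid;	/* refresh the cache */
			return &slots[mid];
		}
		if (gfn < slots[mid].base)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return NULL;
}

int main(void)
{
	printf("0x9000: %s\n", lookup(0x9000) ? "hit" : "miss");	/* search */
	printf("0x9001: %s\n", lookup(0x9001) ? "hit" : "miss");	/* cached */
	return 0;
}

Caching an index rather than a pointer is what makes staleness harmless: a stale index is simply revalidated and, at worst, falls through to the search.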
Signed-off-by: David Matlack Message-Id: <20210804222844.1419481-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f30b53a07917..492d183dd7d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -354,6 +354,13 @@ struct kvm_vcpu { struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; + + /* + * The index of the most recently used memslot by this vCPU. It's ok + * if this becomes stale due to memslot changes since we always check + * it is a valid slot. + */ + int last_used_slot; }; /* must be called with irqs disabled */ @@ -1200,6 +1207,12 @@ try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) if (slot_index < 0 || slot_index >= slots->used_slots) return NULL; + /* + * slot_index can come from vcpu->last_used_slot which is not kept + * in sync with userspace-controllable memslot deletion. So use nospec + * to prevent the CPU from speculating past the end of memslots[]. + */ + slot_index = array_index_nospec(slot_index, slots->used_slots); slot = &slots->memslots[slot_index]; if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) -- cgit v1.2.3 From 08bf54fcf5ca87328541e035090c6a85c8e064f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 2 Aug 2021 21:43:54 +0300 Subject: dmaengine: dw: Convert members to u32 in platform data u32 is the type used for property retrieval from DT. The type change allows the property-reading routine to be cleaned up. While at it, order the fields in the way they are parsed. Signed-off-by: Andy Shevchenko Reviewed-by: Serge Semin Tested-by: Serge Semin Link: https://lore.kernel.org/r/20210802184355.49879-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-dw.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-dw.h b/include/linux/platform_data/dma-dw.h index b11b0c8bc5da..860ba4bc5ead 100644 --- a/include/linux/platform_data/dma-dw.h +++ b/include/linux/platform_data/dma-dw.h @@ -41,11 +41,11 @@ struct dw_dma_slave { /** * struct dw_dma_platform_data - Controller configuration parameters + * @nr_masters: Number of AHB masters supported by the controller * @nr_channels: Number of channels supported by hardware (max 8) * @chan_allocation_order: Allocate channels starting from 0 or 7 * @chan_priority: Set channel priority increasing from 0 to 7 or 7 to 0. * @block_size: Maximum block size supported by the controller - * @nr_masters: Number of AHB masters supported by the controller * @data_width: Maximum data width supported by hardware per AHB master * (in bytes, power of 2) * @multi_block: Multi block transfers supported by hardware per channel. @@ -55,25 +55,25 @@ struct dw_dma_slave { * @quirks: Optional platform quirks.
*/ struct dw_dma_platform_data { - unsigned int nr_channels; + u32 nr_masters; + u32 nr_channels; #define CHAN_ALLOCATION_ASCENDING 0 /* zero to seven */ #define CHAN_ALLOCATION_DESCENDING 1 /* seven to zero */ - unsigned char chan_allocation_order; + u32 chan_allocation_order; #define CHAN_PRIORITY_ASCENDING 0 /* chan0 highest */ #define CHAN_PRIORITY_DESCENDING 1 /* chan7 highest */ - unsigned char chan_priority; - unsigned int block_size; - unsigned char nr_masters; - unsigned char data_width[DW_DMA_MAX_NR_MASTERS]; - unsigned char multi_block[DW_DMA_MAX_NR_CHANNELS]; + u32 chan_priority; + u32 block_size; + u32 data_width[DW_DMA_MAX_NR_MASTERS]; + u32 multi_block[DW_DMA_MAX_NR_CHANNELS]; u32 max_burst[DW_DMA_MAX_NR_CHANNELS]; #define CHAN_PROTCTL_PRIVILEGED BIT(0) #define CHAN_PROTCTL_BUFFERABLE BIT(1) #define CHAN_PROTCTL_CACHEABLE BIT(2) #define CHAN_PROTCTL_MASK GENMASK(2, 0) - unsigned char protctl; + u32 protctl; #define DW_DMA_QUIRK_XBAR_PRESENT BIT(0) - unsigned int quirks; + u32 quirks; }; #endif /* _PLATFORM_DATA_DMA_DW_H */ -- cgit v1.2.3 From f1653c2e2831e9db6cd68473bbec581782df03a5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 6 Aug 2021 11:05:37 -0700 Subject: xfs: introduce CPU hotplug infrastructure We need to move to per-cpu state for both deferred inode inactivation and CIL tracking, but to do that we need to handle CPUs being removed from the system by the hot-plug code. Introduce generic XFS infrastructure to handle CPU hotplug events that is set up at module init time and torn down at module exit time. Initially, we only need CPU dead notifications, so we only set up a callback for these notifications. The infrastructure can be updated in future for other CPU hotplug state machine notifications easily if ever needed. Signed-off-by: Dave Chinner [djwong: rearrange some macros, fix function prototypes] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f39b34b13871..439adc05be4e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -52,6 +52,7 @@ enum cpuhp_state { CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, + CPUHP_XFS_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC, -- cgit v1.2.3 From 9050ad816f5205c0d069e3e492eb849265ae5167 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 2 Aug 2021 01:33:14 +0200 Subject: mfd: db8500-prcmu: Handle missing FW variant There was an "unknown" firmware variant turning up in the wild causing problems in the clock driver. Add this missing variant and clarify that variants 11 and 15 are Samsung variants, as this is now very well known from released products.
Signed-off-by: Linus Walleij Acked-by: Stephen Boyd Signed-off-by: Lee Jones --- include/linux/mfd/dbx500-prcmu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h index e6ee2ec35de9..cbf9d7619493 100644 --- a/include/linux/mfd/dbx500-prcmu.h +++ b/include/linux/mfd/dbx500-prcmu.h @@ -186,10 +186,11 @@ enum ddr_pwrst { #define PRCMU_FW_PROJECT_U8500_C3 8 #define PRCMU_FW_PROJECT_U8500_C4 9 #define PRCMU_FW_PROJECT_U9500_MBL 10 -#define PRCMU_FW_PROJECT_U8500_MBL 11 /* Customer specific */ +#define PRCMU_FW_PROJECT_U8500_SSG1 11 /* Samsung specific */ #define PRCMU_FW_PROJECT_U8500_MBL2 12 /* Customer specific */ #define PRCMU_FW_PROJECT_U8520 13 #define PRCMU_FW_PROJECT_U8420 14 +#define PRCMU_FW_PROJECT_U8500_SSG2 15 /* Samsung specific */ #define PRCMU_FW_PROJECT_U8420_SYSCLK 17 #define PRCMU_FW_PROJECT_A9420 20 /* [32..63] 9540 and derivatives */ -- cgit v1.2.3 From fffe3cc8c2194f60c4af4fac7f27d25e8828f001 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:19 -0600 Subject: dma-mapping: allow map_sg() ops to return negative error codes Allow dma_map_sgtable() to pass errors from the map_sg() ops. This will be required for returning appropriate error codes when mapping P2PDMA memory. Introduce __dma_map_sg_attrs() which will return the raw error code from the map_sg operation (whether it be negative or zero). Then add a dma_map_sg_attrs() wrapper to convert any negative errors to zero to satisfy the existing calling convention. dma_map_sgtable() defines three error codes that .map_sg implementations are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which is a generic return for cases that are passing DMA_MAPPING_ERROR through. dma_map_sgtable() will convert a zero error return for old map_sg() ops into a -EIO return and return any negative errors as reported. This allows map_sg implementations to start returning multiple negative error codes. Legacy map_sg implementations can continue to return zero until they are all converted. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 5 +++-- include/linux/dma-mapping.h | 35 +++++++---------------------------- 2 files changed, 10 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 0d53a96a3d64..2f842498c448 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -41,8 +41,9 @@ struct dma_map_ops { size_t size, enum dma_data_direction dir, unsigned long attrs); /* - * map_sg returns 0 on error and a value > 0 on success. - * It should never return a value < 0. + * map_sg should return a negative error code on error. See + * dma_map_sgtable() for a list of appropriate error codes + * and their meanings. 
*/ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 183e7103a66d..daa1e360f0ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, @@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev, unsigned long attrs) { } +static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, return dma_sync_single_for_device(dev, addr + offset, size, dir); } -/** - * dma_map_sgtable - Map the given buffer for DMA - * @dev: The device for which to perform the DMA operation - * @sgt: The sg_table object describing the buffer - * @dir: DMA direction - * @attrs: Optional DMA attributes for the map operation - * - * Maps a buffer described by a scatterlist stored in the given sg_table - * object for the @dir DMA operation by the @dev device. After success the - * ownership for the buffer is transferred to the DMA domain. One has to - * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the - * ownership of the buffer back to the CPU domain before touching the - * buffer by the CPU. - * - * Returns 0 on success or -EINVAL on error during mapping the buffer. - */ -static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, - enum dma_data_direction dir, unsigned long attrs) -{ - int nents; - - nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents <= 0) - return -EINVAL; - sgt->nents = nents; - return 0; -} - /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation -- cgit v1.2.3 From ad8f36e4b6b1c826a0daa5fda2c5839205b5aa8b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:21 -0600 Subject: iommu: return full error code from iommu_map_sg[_atomic]() Convert to ssize_t return code so the return code from __iommu_map() can be returned all the way down through dma_iommu_map_sg(). 
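Taken together with the dma_map_sgtable() change above, the caller-visible convention becomes: zero on success, a real negative errno on failure. A hedged caller-side sketch of checking it (my_map_buffer() and its device are illustrative, not from the patch):

#include <linux/device.h>
#include <linux/dma-mapping.h>

static int my_map_buffer(struct device *dev, struct sg_table *sgt)
{
	int ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);

	if (ret) {
		/*
		 * ret is a real errno now: -EINVAL for invalid input,
		 * -ENOMEM for allocation failure, -EIO for opaque
		 * legacy/DMA_MAPPING_ERROR cases.
		 */
		dev_err(dev, "DMA mapping failed: %d\n", ret);
		return ret;
	}
	return 0;
}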
Signed-off-by: Logan Gunthorpe Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Christoph Hellwig --- include/linux/iommu.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..9369458ba1bd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -414,11 +414,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, extern size_t iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); -extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, - struct scatterlist *sg,unsigned int nents, int prot); -extern size_t iommu_map_sg_atomic(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot); +extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot); +extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token); @@ -679,18 +679,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain *domain, return 0; } -static inline size_t iommu_map_sg(struct iommu_domain *domain, - unsigned long iova, struct scatterlist *sg, - unsigned int nents, int prot) +static inline ssize_t iommu_map_sg(struct iommu_domain *domain, + unsigned long iova, struct scatterlist *sg, + unsigned int nents, int prot) { - return 0; + return -ENODEV; } -static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain, +static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) { - return 0; + return -ENODEV; } static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) -- cgit v1.2.3 From 71d3d0ebc894294ef9454e45a3ac2e9ba60b3351 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 08:01:27 -0400 Subject: SUNRPC: Convert rpc_client refcount to use refcount_t There are now tools in the refcount library that allow us to convert the client shutdown code. Reported-by: Xiyu Yang Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 8b5d5c97553e..b2edd5fc2f0c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ struct rpc_sysfs_client; * The high-level client handle */ struct rpc_clnt { - atomic_t cl_count; /* Number of references */ + refcount_t cl_count; /* Number of references */ unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ -- cgit v1.2.3 From c2dc3e5fad13aca5d7bdf4bcb52b1a1d707c8555 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:23 -0400 Subject: SUNRPC: Fix potential memory corruption We really should not call rpc_wake_up_queued_task_set_status() with xprt->snd_task as an argument unless we are certain that is actually an rpc_task. 
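The XPRT_SND_IS_COOKIE bit added below enables a guard of roughly this shape; this is a sketch of the pattern, not the actual net/sunrpc hunk:

static void my_wake_pending_tasks(struct rpc_xprt *xprt, int status)
{
	/* snd_task may hold an opaque cookie rather than an rpc_task */
	if (test_bit(XPRT_SND_IS_COOKIE, &xprt->state))
		return;
	rpc_wake_up_queued_task_set_status(&xprt->pending,
					   xprt->snd_task, status);
}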
Fixes: 0445f92c5d53 ("SUNRPC: Fix disconnection races") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..59cd97da895b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -432,6 +432,7 @@ void xprt_release_write(struct rpc_xprt *, struct rpc_task *); #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) +#define XPRT_SND_IS_COOKIE (12) static inline void xprt_set_connected(struct rpc_xprt *xprt) { -- cgit v1.2.3 From de2860f4636256836450c6543be744a50118fc66 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Aug 2021 10:10:00 -0700 Subject: mm: Add kvrealloc() During log recovery of an XFS filesystem with 64kB directory buffers, rebuilding a buffer split across two log records results in a memory allocation warning from krealloc like this: xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff) XFS (dm-0): Unmounting Filesystem XFS (dm-0): Mounting V5 Filesystem XFS (dm-0): Starting recovery (logdev: internal) ------------[ cut here ]------------ WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40 ..... RIP: 0010:get_page_from_freelist+0xdee/0xe40 Call Trace: ? complete+0x3f/0x50 __alloc_pages+0x16f/0x300 alloc_pages+0x87/0x110 kmalloc_order+0x2c/0x90 kmalloc_order_trace+0x1d/0x90 __kmalloc_track_caller+0x215/0x270 ? xlog_recover_add_to_cont_trans+0x63/0x1f0 krealloc+0x54/0xb0 xlog_recover_add_to_cont_trans+0x63/0x1f0 xlog_recovery_process_trans+0xc1/0xd0 xlog_recover_process_ophdr+0x86/0x130 xlog_recover_process_data+0x9f/0x160 xlog_recover_process+0xa2/0x120 xlog_do_recovery_pass+0x40b/0x7d0 ? __irq_work_queue_local+0x4f/0x60 ? irq_work_queue+0x3a/0x50 xlog_do_log_recovery+0x70/0x150 xlog_do_recover+0x38/0x1d0 xlog_recover+0xd8/0x170 xfs_log_mount+0x181/0x300 xfs_mountfs+0x4a1/0x9b0 xfs_fs_fill_super+0x3c0/0x7b0 get_tree_bdev+0x171/0x270 ? suffix_kstrtoint.constprop.0+0xf0/0xf0 xfs_fs_get_tree+0x15/0x20 vfs_get_tree+0x24/0xc0 path_mount+0x2f5/0xaf0 __x64_sys_mount+0x108/0x140 do_syscall_64+0x3a/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae Essentially, we are taking a multi-order allocation from kmem_alloc() (which has an open coded no fail, no warn loop) and then reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is then triggering the above warning. This is a regression caused by converting this code from an open coded no fail/no warn reallocation loop to using __GFP_NOFAIL. What we actually need here is kvrealloc(), so that if contiguous page allocation fails we fall back to vmalloc() and we don't get nasty warnings happening in XFS. Fixes: 771915c4f688 ("xfs: remove kmem_realloc()") Signed-off-by: Dave Chinner Acked-by: Mel Gorman Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..e59646a5d44d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -829,6 +829,8 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) return kvmalloc_array(n, size, flags | __GFP_ZERO); } +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -- cgit v1.2.3 From 438623a06bacd69c40c4af633bb09a3bbb9dfc78 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:06 -0400 Subject: SUNRPC: Add svc_rqst::rq_auth_stat I'd like to take commit 4532608d71c8 ("SUNRPC: Clean up generic dispatcher code") even further by using only private local SVC dispatchers for all kernel RPC services. This change would enable the removal of the logic that switches between svc_generic_dispatch() and a service's private dispatcher, and simplify the invocation of the service's pc_release method so that humans can visually verify that it is always invoked properly. All that will come later. First, let's provide a better way to return authentication errors from SVC dispatcher functions. Instead of overloading the dispatch method's *statp argument, add a field to struct svc_rqst that can hold an error value. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 1 + include/linux/sunrpc/svcauth.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..35f12963e1ff 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -282,6 +282,7 @@ struct svc_rqst { void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ + __be32 rq_auth_stat; /* authentication status */ int rq_auth_slack; /* extra space xdr code * should leave in head * for krb5i, krb5p. diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index b0003866a249..6d9cc9080aca 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -127,7 +127,7 @@ struct auth_ops { char * name; struct module *owner; int flavour; - int (*accept)(struct svc_rqst *rq, __be32 *authp); + int (*accept)(struct svc_rqst *rq); int (*release)(struct svc_rqst *rq); void (*domain_release)(struct auth_domain *); int (*set_client)(struct svc_rqst *rq); @@ -149,7 +149,7 @@ struct auth_ops { struct svc_xprt; -extern int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp); +extern int svc_authenticate(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern int svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); -- cgit v1.2.3 From 9082e1d914f8b27114352b1940bbcc7522f682e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:19 -0400 Subject: SUNRPC: Eliminate the RQ_AUTHERR flag Now that there is an alternate method for returning an auth_stat value, replace the RQ_AUTHERR flag with use of that new method. 
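With rq_auth_stat in place, an authentication flavour reports the failure code through the request itself instead of overloading an output parameter; a sketch using a hypothetical flavour (credentials_ok() is an assumed helper, not a SUNRPC API):

static int myauth_accept(struct svc_rqst *rqstp)
{
	if (!credentials_ok(rqstp)) {		/* assumed helper */
		rqstp->rq_auth_stat = rpc_autherr_badcred;
		return SVC_DENIED;
	}
	rqstp->rq_auth_stat = rpc_auth_ok;
	return SVC_OK;
}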
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 35f12963e1ff..63c9210cae06 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -275,7 +275,6 @@ struct svc_rqst { #define RQ_VICTIM (5) /* about to be shut down */ #define RQ_BUSY (6) /* request is busy */ #define RQ_DATA (7) /* request has data */ -#define RQ_AUTHERR (8) /* Request status is auth error */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ @@ -533,7 +532,6 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -__be32 svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err); __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *procinfo); -- cgit v1.2.3 From b390752191a6e09e8fb89625e227db0d5cc0ca33 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Aug 2021 20:39:25 +0300 Subject: gpiolib: Deduplicate forward declaration in the consumer.h header struct acpi_device is repeated in two branches of ifdeffery. Move it out and hence deduplicate. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij --- include/linux/gpio/consumer.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 566feb56601f..414b8f98d70f 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -680,10 +680,10 @@ struct acpi_gpio_mapping { unsigned int quirks; }; -#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI) - struct acpi_device; +#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_ACPI) + int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios); void acpi_dev_remove_driver_gpios(struct acpi_device *adev); @@ -696,8 +696,6 @@ struct gpio_desc *acpi_get_and_request_gpiod(char *path, int pin, char *label); #else /* CONFIG_GPIOLIB && CONFIG_ACPI */ -struct acpi_device; - static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios) { -- cgit v1.2.3 From c1b291e96a6d27ac83938596829086945ff8a36e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Aug 2021 19:00:16 +0300 Subject: gpio: dwapb: Unify ACPI enumeration checks in get_irq() and configure_irqs() Shared IRQ is only enabled for ACPI enumeration, there is no need to have a special flag for that, since we can simply test whether the device has been enumerated by ACPI. This unifies the checks in dwapb_get_irq() and dwapb_configure_irqs().
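The unified check can be as simple as testing for an ACPI companion. A sketch of the idea with illustrative names; only has_acpi_companion() is a real kernel helper here:

#include <linux/acpi.h>
#include <linux/interrupt.h>

static int my_request_gpio_irq(struct device *dev, unsigned int irq,
			       irq_handler_t handler)
{
	/* a shared IRQ is only needed when enumerated via ACPI */
	unsigned long flags = has_acpi_companion(dev) ? IRQF_SHARED : 0;

	return devm_request_irq(dev, irq, handler, flags, dev_name(dev), dev);
}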
Signed-off-by: Andy Shevchenko Acked-by: Lee Jones Acked-by: Serge Semin Tested-by: Serge Semin --- include/linux/platform_data/gpio-dwapb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h index 0aa5c6720259..535e5ed549d9 100644 --- a/include/linux/platform_data/gpio-dwapb.h +++ b/include/linux/platform_data/gpio-dwapb.h @@ -14,7 +14,6 @@ struct dwapb_port_property { unsigned int ngpio; unsigned int gpio_base; int irq[DWAPB_MAX_GPIOS]; - bool irq_shared; }; struct dwapb_platform_data { -- cgit v1.2.3 From 5111c2b6b0194b509f47e6338c4deeeb4497bda8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Aug 2021 19:00:19 +0300 Subject: gpio: dwapb: Get rid of legacy platform data Platform data is a legacy interface to supply device properties to the driver. In this case there are no longer any in-kernel users of it. Just remove it for good. Signed-off-by: Andy Shevchenko Acked-by: Serge Semin Tested-by: Serge Semin --- include/linux/platform_data/gpio-dwapb.h | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 include/linux/platform_data/gpio-dwapb.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-dwapb.h b/include/linux/platform_data/gpio-dwapb.h deleted file mode 100644 index 535e5ed549d9..000000000000 --- a/include/linux/platform_data/gpio-dwapb.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright(c) 2014 Intel Corporation. - */ - -#ifndef GPIO_DW_APB_H -#define GPIO_DW_APB_H - -#define DWAPB_MAX_GPIOS 32 - -struct dwapb_port_property { - struct fwnode_handle *fwnode; - unsigned int idx; - unsigned int ngpio; - unsigned int gpio_base; - int irq[DWAPB_MAX_GPIOS]; -}; - -struct dwapb_platform_data { - struct dwapb_port_property *properties; - unsigned int nports; -}; - -#endif -- cgit v1.2.3 From ae03c3771b8cbbed3802ad1153d896c32015c520 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 5 Aug 2021 22:18:59 -0300 Subject: vfio: Introduce a vfio_uninit_group_dev() API call This pairs with vfio_init_group_dev() and allows undoing any state that is stored in the vfio_device unrelated to registration. Add appropriately placed calls to all the drivers. The following patch will use this to add pre-registration state for the device set.
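The intended pairing in a driver probe path looks roughly like this (struct my_dev and my_vfio_ops are placeholders):

static int my_probe(struct my_dev *mdev)
{
	int ret;

	vfio_init_group_dev(&mdev->vdev, mdev->dev, &my_vfio_ops);

	ret = vfio_register_group_dev(&mdev->vdev);
	if (ret)
		vfio_uninit_group_dev(&mdev->vdev);	/* undo init-time state */
	return ret;
}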
Signed-off-by: Max Gurtovoy Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a2c5b30e1763..b0875cf8e496 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -61,6 +61,7 @@ extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); +void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); -- cgit v1.2.3 From 2fd585f4ed9de9b9259e95affdd7d8cde06b48c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:00 -0300 Subject: vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. 
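As a sketch of the driver-side usage this enables, a PCI driver that needs open/close serialized across a whole bus could pass the bus pointer as the set_id via the vfio_assign_device_set() helper added below (the wrapper struct and allocator are hypothetical):

static int my_vfio_pci_probe(struct pci_dev *pdev)
{
	struct my_vfio_pci *vdev = my_vfio_pci_alloc(pdev);	/* assumed */
	int ret;

	/* every device that passes pdev->bus shares one open/close lock */
	ret = vfio_assign_device_set(&vdev->core, pdev->bus);
	if (ret)
		return ret;

	return vfio_register_group_dev(&vdev->core);
}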
Signed-off-by: Yishai Hadas Reviewed-by: Cornelia Huck Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 ++ include/linux/vfio.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 3a38598c2605..cb5b7ed1d7c3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -111,6 +111,8 @@ struct mdev_parent_ops { int (*create)(struct mdev_device *mdev); int (*remove)(struct mdev_device *mdev); + int (*open_device)(struct mdev_device *mdev); + void (*close_device)(struct mdev_device *mdev); int (*open)(struct mdev_device *mdev); void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b0875cf8e496..f0e6a72875e4 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,13 +15,28 @@ #include #include +/* + * VFIO devices can be placed in a set, this allows all devices to share this + * structure and the VFIO core will provide a lock that is held around + * open_device()/close_device() for all devices in the set. + */ +struct vfio_device_set { + void *set_id; + struct mutex lock; + struct list_head device_list; + unsigned int device_count; +}; + struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; struct vfio_group *group; + struct vfio_device_set *dev_set; + struct list_head dev_set_list; /* Members below here are private, not for driver use */ refcount_t refcount; + unsigned int open_count; struct completion comp; struct list_head group_next; }; @@ -29,6 +44,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @open_device: Called when the first file descriptor is opened for this device + * @close_device: Opposite of open_device * @open: Called when userspace creates new file descriptor for device * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor @@ -43,6 +60,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*open_device)(struct vfio_device *vdev); + void (*close_device)(struct vfio_device *vdev); int (*open)(struct vfio_device *vdev); void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -67,6 +86,8 @@ void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); +int vfio_assign_device_set(struct vfio_device *device, void *set_id); + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, -- cgit v1.2.3 From eb24c1007e6852e024dc33b0dd9617b8500a1291 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 5 Aug 2021 22:19:10 -0300 Subject: vfio: Remove struct vfio_device_ops open/release Nothing uses this anymore, delete it. 
Signed-off-by: Yishai Hadas Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/14-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mdev.h | 7 ------- include/linux/vfio.h | 4 ---- 2 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index cb5b7ed1d7c3..68427e8fadeb 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,11 +72,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype); * @mdev: mdev_device device structure which is being * destroyed * Returns integer: success (0) or error (< 0) - * @open: Open mediated device. - * @mdev: mediated device. - * Returns integer: success (0) or error (< 0) - * @release: release mediated device - * @mdev: mediated device. * @read: Read emulation callback * @mdev: mediated device structure * @buf: read buffer @@ -113,8 +108,6 @@ struct mdev_parent_ops { int (*remove)(struct mdev_device *mdev); int (*open_device)(struct mdev_device *mdev); void (*close_device)(struct mdev_device *mdev); - int (*open)(struct mdev_device *mdev); - void (*release)(struct mdev_device *mdev); ssize_t (*read)(struct mdev_device *mdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f0e6a72875e4..b53a9557884a 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -46,8 +46,6 @@ struct vfio_device { * * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device - * @open: Called when userspace creates new file descriptor for device - * @release: Called when userspace releases file descriptor for device * @read: Perform read(2) on device file descriptor * @write: Perform write(2) on device file descriptor * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* @@ -62,8 +60,6 @@ struct vfio_device { */ struct vfio_device_ops { char *name; int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); - int (*open)(struct vfio_device *vdev); - void (*release)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, -- cgit v1.2.3 From c17495b01b72b53bd290f442d39b060e015c7aea Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 10 Aug 2021 12:04:33 +0530 Subject: cpufreq: Add callback to register with energy model Many cpufreq drivers register with the energy model for each policy and do exactly the same thing. Follow in the footsteps of thermal-cooling and get it done from the cpufreq core itself. Provide a new callback, which will be called, if present, by the cpufreq core at the right moment (more on that in the code's comment). Also provide a generic implementation that uses dev_pm_opp_of_register_em(). This also allows us to register with the EM at a later point in time, compared to ->init(), from where the EM core can access the cpufreq policy directly using cpufreq_cpu_get() type of helpers and perform other work, like marking a few frequencies inefficient; this will be done separately.
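A driver that is content with the generic OPP-based registration then reduces to setting the new callback; a sketch, with the other callbacks assumed to exist, using the cpufreq_register_em_with_opp() helper added below:

static struct cpufreq_driver my_cpufreq_driver = {
	.name		= "my-cpufreq",
	.init		= my_cpufreq_init,		/* assumed callbacks */
	.target_index	= my_cpufreq_target_index,
	.register_em	= cpufreq_register_em_with_opp,
};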
Reviewed-by: Quentin Perret Reviewed-by: Lukasz Luba Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9fd719475fcd..c65a1d7385f8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -9,10 +9,12 @@ #define _LINUX_CPUFREQ_H #include +#include #include #include #include #include +#include #include #include #include @@ -373,6 +375,12 @@ struct cpufreq_driver { /* platform specific boost support code */ bool boost_enabled; int (*set_boost)(struct cpufreq_policy *policy, int state); + + /* + * Set by drivers that want to register with the energy model after the + * policy is properly initialized, but before the governor is started. + */ + void (*register_em)(struct cpufreq_policy *policy); }; /* flags */ @@ -1046,4 +1054,10 @@ unsigned int cpufreq_generic_get(unsigned int cpu); void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) +{ + dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), + policy->related_cpus); +} #endif /* _LINUX_CPUFREQ_H */ -- cgit v1.2.3 From 892384cd998a17960dff6ebefc27375f63364111 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 3 Aug 2021 14:16:49 +0200 Subject: iommu/io-pgtable: Add DART pagetable format Apple's DART iommu uses a pagetable format that shares some similarities with the ones already implemented by io-pgtable.c. Add a new format variant to support the required differences so that we don't have to duplicate the pagetable handling code. Reviewed-by: Alexander Graf Reviewed-by: Alyssa Rosenzweig Reviewed-by: Robin Murphy Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210803121651.61594-2-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..a738483fb4da 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -16,6 +16,7 @@ enum io_pgtable_fmt { ARM_V7S, ARM_MALI_LPAE, AMD_IOMMU_V1, + APPLE_DART, IO_PGTABLE_NUM_FMTS, }; @@ -136,6 +137,11 @@ struct io_pgtable_cfg { u64 transtab; u64 memattr; } arm_mali_lpae_cfg; + + struct { + u64 ttbr[4]; + u32 n_ttbrs; + } apple_dart_cfg; }; }; @@ -254,5 +260,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns; extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns; +extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns; #endif /* __IO_PGTABLE_H */ -- cgit v1.2.3 From ca91ea34778f9b2a44a391b10164bcd73b4b0f25 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:54 +1200 Subject: asus-wmi: Add panel overdrive functionality Some ASUS ROG laptops have the ability to drive the display panel at a higher rate to eliminate or reduce ghosting. Signed-off-by: Luke D.
Jones Link: https://lore.kernel.org/r/20210807023656.25020-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 2f274cf52805..428aea701c7b 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -61,6 +61,7 @@ #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075 /* Misc */ +#define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 -- cgit v1.2.3 From 98829e84dc67630efb7de675f0a70066620468a3 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:55 +1200 Subject: asus-wmi: Add dgpu disable method In Windows the ASUS Armory Crate program can enable or disable the dGPU via a WMI call. This functions much the same as various Linux methods in software where the dGPU is removed from the device tree. However the WMI call saves the state of dGPU (enabled or not) and this then changes the dGPU visibility in Linux with no way for Linux users to re-enable it. We expose the WMI method so users can see and change the dGPU ACPI state. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20210807023656.25020-3-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 428aea701c7b..a528f9d0e4b7 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -90,6 +90,9 @@ /* Keyboard dock */ #define ASUS_WMI_DEVID_KBD_DOCK 0x00120063 +/* dgpu on/off */ +#define ASUS_WMI_DEVID_DGPU 0x00090020 + /* DSTS masks */ #define ASUS_WMI_DSTS_STATUS_BIT 0x00000001 #define ASUS_WMI_DSTS_UNKNOWN_BIT 0x00000002 -- cgit v1.2.3 From 382b91db8044669d254006df799df9d85d4ad891 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 7 Aug 2021 14:36:56 +1200 Subject: asus-wmi: Add egpu enable method The X13 Flow laptops can utilise an external GPU. This requires toggling an ACPI method which will first disable the internal dGPU, and then enable the eGPU. Signed-off-by: Luke D. Jones Link: https://lore.kernel.org/r/20210807023656.25020-4-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index a528f9d0e4b7..17dc5cb6f3f2 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -90,6 +90,9 @@ /* Keyboard dock */ #define ASUS_WMI_DEVID_KBD_DOCK 0x00120063 +/* dgpu on/off */ +#define ASUS_WMI_DEVID_EGPU 0x00090019 + /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 -- cgit v1.2.3 From 20a1b3acfc802ad7b6b327f2bdc0570711538561 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 3 Aug 2021 18:00:41 +0200 Subject: i2c: acpi: Add an i2c_acpi_client_count() helper function We have 3 files now which have the need to count the number of I2cSerialBus resources in an ACPI-device's resource-list. 
Currently all implement their own helper function for this, add a generic helper function to replace the 3 implementations. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210803160044.158802-2-hdegoede@redhat.com Acked-by: Mika Westerberg Acked-by: Wolfram Sang --- include/linux/i2c.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 3eb60a2e9e61..2ce3efbe9198 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1010,6 +1010,7 @@ struct acpi_resource_i2c_serialbus; #if IS_ENABLED(CONFIG_ACPI) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); +int i2c_acpi_client_count(struct acpi_device *adev); u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info); @@ -1020,6 +1021,10 @@ static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, { return false; } +static inline int i2c_acpi_client_count(struct acpi_device *adev) +{ + return 0; +} static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; -- cgit v1.2.3 From fd00faa375fbb9d46ae0730d0faf4a3006301005 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 8 Aug 2021 19:21:56 +0200 Subject: PCI/VPD: Embed struct pci_vpd in struct pci_dev Now that struct pci_vpd is really small, simplify the code by embedding struct pci_vpd directly in struct pci_dev instead of dynamically allocating it. Link: https://lore.kernel.org/r/d898489e-22ba-71f1-2f31-f1a78dc15849@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..e752cc39a1fe 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -300,9 +300,14 @@ struct pci_cap_saved_state { struct pci_cap_saved_data cap; }; +struct pci_vpd { + struct mutex lock; + unsigned int len; + u8 cap; +}; + struct irq_affinity; struct pcie_link_state; -struct pci_vpd; struct pci_sriov; struct pci_p2pdma; struct rcec_ea; @@ -473,7 +478,7 @@ struct pci_dev { #ifdef CONFIG_PCI_MSI const struct attribute_group **msi_irq_groups; #endif - struct pci_vpd *vpd; + struct pci_vpd vpd; #ifdef CONFIG_PCIE_DPC u16 dpc_cap; unsigned int dpc_rp_extensions:1; -- cgit v1.2.3 From 928f9e2686110825262685b7aedc7b21b805fecd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Aug 2021 20:00:23 +0300 Subject: clk: fractional-divider: Hide clk_fractional_divider_ops from wide audience The providers are all located in drivers/clk/ and hence no need to export the clock operations to wider audience. Hide them by moving to drivers/clk/clk-fractional-divider.h. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210812170025.67074-2-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index d83b829305c0..acb8e10d2898 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1023,7 +1023,6 @@ struct clk_fractional_divider { #define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) #define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) -extern const struct clk_ops clk_fractional_divider_ops; struct clk *clk_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 mshift, u8 mwidth, u8 nshift, u8 nwidth, -- cgit v1.2.3 From 82f53f9ee5770177eb102446cc3513bf07e2668a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Aug 2021 20:00:24 +0300 Subject: clk: fractional-divider: Introduce POWER_OF_TWO_PS flag The newly introduced POWER_OF_TWO_PS flag, when set, makes the flow skip the assumption that the caller will use an additional 2^scale prescaler to get the desired clock rate. Reported-by: Liu Ying Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210812170025.67074-3-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index acb8e10d2898..d63d07fd251b 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1001,6 +1001,12 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev, * CLK_FRAC_DIVIDER_BIG_ENDIAN - By default little endian register accesses are * used for the divider register. Setting this flag makes the register * accesses big endian. + * CLK_FRAC_DIVIDER_POWER_OF_TWO_PS - By default the resulting fraction might + * be saturated and the caller will get quite far from the good enough + * approximation. Instead the caller may require, by setting this flag, + * to shift left by a few bits in case, when the asked one is quite small + * to satisfy the desired range of denominator. It assumes that on the + * caller's side the power-of-two capable prescaler exists. */ struct clk_fractional_divider { struct clk_hw hw; @@ -1022,6 +1028,7 @@ struct clk_fractional_divider { #define CLK_FRAC_DIVIDER_ZERO_BASED BIT(0) #define CLK_FRAC_DIVIDER_BIG_ENDIAN BIT(1) +#define CLK_FRAC_DIVIDER_POWER_OF_TWO_PS BIT(2) struct clk *clk_register_fractional_divider(struct device *dev, const char *name, const char *parent_name, unsigned long flags, -- cgit v1.2.3 From 7bb698f09bdd01fbb6d48c14bb1dde556dc1af00 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 28 Jul 2021 07:47:33 -0500 Subject: fs: Move notify_change permission checks into may_setattr Move the permission checks in notify_change into a separate function to make them available to filesystems. When notify_change is called, the vfs performs those checks before calling into iop->setattr. However, a filesystem like gfs2 can only lock and revalidate the inode inside ->setattr, and it must then repeat those checks to err on the safe side. It would be nice to get rid of the double checking, but moving the permission check into iop->setattr altogether isn't really an option.
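With may_setattr() exported, an ->setattr implementation in the gfs2 mould can take its locks first and then run the common checks exactly once. A sketch with placeholder locking and apply helpers:

static int my_setattr(struct user_namespace *mnt_userns,
		      struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	my_inode_lock(inode);			/* fs-specific, placeholder */
	error = may_setattr(mnt_userns, inode, attr->ia_valid);
	if (!error)
		error = my_apply_setattr(inode, attr);	/* placeholder */
	my_inode_unlock(inode);
	return error;
}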
Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..50192964bf6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3469,6 +3469,8 @@ extern int buffer_migrate_page_norefs(struct address_space *, #define buffer_migrate_page_norefs NULL #endif +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid); int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct user_namespace *, struct inode *inode, -- cgit v1.2.3 From 3165af738ed3224a84ead7d97c6909de2e453b4c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Jul 2021 18:04:49 -0400 Subject: KVM: Allow to have arch-specific per-vm debugfs files Allow archs to create arch-specific nodes under kvm->debugfs_dentry directory besides the stats fields. The new interface kvm_arch_create_vm_debugfs() is defined but not yet used. It's called after kvm->debugfs_dentry is created, so it can be referenced directly in kvm_arch_create_vm_debugfs(). Arch should define their own versions when they want to create extra debugfs nodes. Signed-off-by: Peter Xu Message-Id: <20210730220455.26054-2-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 492d183dd7d0..f50bfcf225f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1073,6 +1073,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); +int kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* -- cgit v1.2.3 From 2a047e0662aee1bd773e0415accd785ad26a9398 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 17:19:36 +0200 Subject: dma-mapping: return an unsigned int from dma_map_sg{,_attrs} These can only return 0 for failure or the number of entries, so turn the return value into an unsigned int. 
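For callers, nothing changes except that the zero-means-failure convention is now visible in the type. A minimal sketch with illustrative names:

static int my_map(struct device *dev, struct scatterlist *sgl, int nents)
{
	unsigned int mapped = dma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (!mapped)
		return -EIO;	/* 0 is the only failure value */

	return mapped;		/* otherwise a positive entry count */
}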
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe --- include/linux/dma-mapping.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index daa1e360f0ee..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -105,8 +105,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs); +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); @@ -166,8 +166,9 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) +static inline unsigned int dma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } -- cgit v1.2.3 From fb83610762dd5927212aa62a468dd3b756b57a88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jul 2021 11:06:44 +0200 Subject: thermal/core: Fix thermal_cooling_device_register() prototype There are two pairs of declarations for thermal_cooling_device_register() and thermal_of_cooling_device_register(), and only one set was changed in a recent patch, so the other one now causes a compile-time warning: drivers/net/wireless/mediatek/mt76/mt7915/init.c: In function 'mt7915_thermal_init': drivers/net/wireless/mediatek/mt76/mt7915/init.c:134:48: error: passing argument 1 of 'thermal_cooling_device_register' discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 134 | cdev = thermal_cooling_device_register(wiphy_name(wiphy), phy, | ^~~~~~~~~~~~~~~~~ In file included from drivers/net/wireless/mediatek/mt76/mt7915/init.c:7: include/linux/thermal.h:407:39: note: expected 'char *' but argument is of type 'const char *' 407 | thermal_cooling_device_register(char *type, void *devdata, | ~~~~~~^~~~ Change the dummy helper functions to have the same arguments as the normal version. 
Fixes: f991de53a8ab ("thermal: make device_register's type argument const") Signed-off-by: Arnd Bergmann Reviewed-by: Jean-Francois Dagenais Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210722090717.1116748-1-arnd@kernel.org --- include/linux/thermal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d296f3b88fb9..8050d929a5b4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -404,12 +404,13 @@ static inline void thermal_zone_device_unregister( struct thermal_zone_device *tz) { } static inline struct thermal_cooling_device * -thermal_cooling_device_register(char *type, void *devdata, +thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, - char *type, void *devdata, const struct thermal_cooling_device_ops *ops) + const char *type, void *devdata, + const struct thermal_cooling_device_ops *ops) { return ERR_PTR(-ENODEV); } static inline struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, -- cgit v1.2.3 From 454f2ed4b34f9ef5726d080b1eb5dc47a7f36d6f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Jul 2021 13:27:01 +0200 Subject: thermal: Spelling s/scallbacks/callbacks/ Fix a misspelling of the word "callbacks". Signed-off-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/ae38372996a23bb67769e2d62ca170ae9457c4df.1626261946.git.geert+renesas@glider.be --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 8050d929a5b4..c314893970b3 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -285,7 +285,7 @@ struct thermal_zone_params { }; /** - * struct thermal_zone_of_device_ops - scallbacks for handling DT based zones + * struct thermal_zone_of_device_ops - callbacks for handling DT based zones * * Mandatory: * @get_temp: a pointer to a function that reads the sensor temperature. -- cgit v1.2.3 From c3b1c377f0102e88dea6354d9cdb34a9d1c90971 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 2 Aug 2021 14:02:34 +0000 Subject: tracing: Fix a typo in tracepoint.h It should be @prev_pid, not @prev_prid. 
Link: https://lkml.kernel.org/r/20210802140234.5383-1-shijie@os.amperecomputing.com Signed-off-by: Huang Shijie Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index ab58696d0ddd..28031b15f878 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -475,7 +475,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declaration: + * * __field(pid_t, prev_pid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * -- cgit v1.2.3 From 4f911138c8da94bcff84f1d093d28e378703c43f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 19 Jun 2021 12:26:16 +0300 Subject: fs: add generic helper for filling statx attribute flags The immutable and append-only properties on an inode are published on the inode's i_flags and enforced by the VFS. Create a helper to fill the corresponding STATX_ATTR_ flags in the kstat structure from the inode's i_flags. Only orangefs was converted to use this helper. Other filesystems could use it in the future. Suggested-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 1 + include/linux/stat.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..ae6c6c34db94 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3355,6 +3355,7 @@ extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index fff27e603814..7df06931f25d 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -34,6 +34,10 @@ struct kstat { STATX_ATTR_ENCRYPTED | \ STATX_ATTR_VERITY \ )/* Attrs corresponding to FS_*_FL flags */ +#define KSTAT_ATTR_VFS_FLAGS \ + (STATX_ATTR_IMMUTABLE | \ + STATX_ATTR_APPEND \ + ) /* Attrs corresponding to S_* flags that are enforced by the VFS */ u64 ino; dev_t dev; dev_t rdev; -- cgit v1.2.3 From 69139244806537f9d51364f37fe146bb2ee88a05 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:52 +0530 Subject: PCI: Cache PCIe Device Capabilities register Add a new member called devcap in struct pci_dev for caching the PCIe Device Capabilities register to avoid reading PCI_EXP_DEVCAP multiple times. Refactor pcie_has_flr() to use cached device capabilities.
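A sketch of the kind of check the cached field enables (simplified; example_has_flr() is an illustrative name, not the exact pcie_has_flr() body):

static bool example_has_flr(struct pci_dev *dev)
{
	if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET)
		return false;

	/* No config-space read needed: devcap was cached at enumeration. */
	return dev->devcap & PCI_EXP_DEVCAP_FLR;
}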
Link: https://lore.kernel.org/r/20210817180500.1253-2-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..1179c0ee2bfb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -333,6 +333,7 @@ struct pci_dev { struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ struct pci_dev *rcec; /* Associated RCEC device */ #endif + u32 devcap; /* PCIe Device Capabilities */ u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ u8 msix_cap; /* MSI-X capability offset */ -- cgit v1.2.3 From 56f107d7813f116484019617043393a7753ffcbf Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:53 +0530 Subject: PCI: Add pcie_reset_flr() with 'probe' argument Most reset methods are of the form "pci_*_reset(dev, probe)". pcie_flr() was an exception because it relied on a separate pcie_has_flr() function instead of taking a "probe" argument. Add "pcie_reset_flr(dev, probe)" to follow the convention. Remove pcie_has_flr(). Some pcie_flr() callers that did not use pcie_has_flr() remain. [bhelgaas: commit log, rework pcie_reset_flr() to use dev->devcap directly] Link: https://lore.kernel.org/r/20210817180500.1253-3-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1179c0ee2bfb..1de37e3fc29d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1229,7 +1229,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -bool pcie_has_flr(struct pci_dev *dev); +int pcie_reset_flr(struct pci_dev *dev, int probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -- cgit v1.2.3 From e20afa06244eb5d7fa850f9fe2a78ae17ba96f81 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:54 +0530 Subject: PCI: Add array to track reset method ordering Add reset_methods[] in struct pci_dev to keep track of reset mechanisms supported by the device and their ordering. Refactor probing and reset functions to take advantage of calling convention of reset functions. Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-4-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 1de37e3fc29d..2faf517d20c1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -49,6 +49,9 @@ PCI_STATUS_SIG_TARGET_ABORT | \ PCI_STATUS_PARITY) +/* Number of reset methods used in pci_reset_fn_methods array in pci.c */ +#define PCI_NUM_RESET_METHODS 6 + /* * The PCI interface treats multi-function devices as independent * devices. 
The slot/function address of each device is encoded @@ -506,6 +509,9 @@ struct pci_dev { char *driver_override; /* Driver name to force a match */ unsigned long priv_flags; /* Private flags for the PCI driver */ + + /* These methods index pci_reset_fn_methods[] */ + u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */ }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) -- cgit v1.2.3 From 4ec36dfeb155b72da8d28ab006a46f2f8b981eac Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:34:55 +0530 Subject: PCI: Remove reset_fn field from pci_dev "reset_fn" indicates whether the device supports any reset mechanism. Remove the use of reset_fn in favor of the reset_methods array that tracks supported reset mechanisms of a device and their ordering. The octeon driver incorrectly used reset_fn to detect whether the device supports FLR or not. Use pcie_reset_flr() to probe whether it supports FLR. Co-developed-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-5-ameynarkhede03@gmail.com Signed-off-by: Alex Williamson Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson Reviewed-by: Raphael Norwitz --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2faf517d20c1..d1f4d248617b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -431,7 +431,6 @@ struct pci_dev { unsigned int state_saved:1; unsigned int is_physfn:1; unsigned int is_virtfn:1; - unsigned int reset_fn:1; unsigned int is_hotplug_bridge:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ -- cgit v1.2.3 From de3438c47a8ddc75548e62a03736a9321c2b7bac Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Mon, 9 Aug 2021 15:15:59 -0400 Subject: firmware: qcom_scm: Introduce SCM calls to access LMh Introduce SCM calls to access/configure limits management hardware(LMH). 
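A hedged sketch of a consumer of the new calls (the profile id and error handling are illustrative only):

	int ret;

	if (!qcom_scm_lmh_dcvsh_available())
		return -ENODEV;

	/* Ask LMh to switch to a different throttling profile. */
	ret = qcom_scm_lmh_profile_change(1);
	if (ret)
		return ret;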
Reviewed-by: Bjorn Andersson Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210809191605.3742979-2-thara.gopinath@linaro.org --- include/linux/qcom_scm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index 0165824c5128..c0475d1c9885 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -109,6 +109,12 @@ extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp); extern int qcom_scm_qsmmu500_wait_safe_toggle(bool en); + +extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version); +extern int qcom_scm_lmh_profile_change(u32 profile_id); +extern bool qcom_scm_lmh_dcvsh_available(void); + #else #include <linux/errno.h> @@ -170,5 +176,13 @@ static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) { return -ENODEV; } + +static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, + u64 limit_node, u32 node_id, u64 version) + { return -ENODEV; } + +static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } + +static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } #endif #endif -- cgit v1.2.3 From 46983fcd67ac5a830d41ebe3755314db67a6dd16 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:15 +0100 Subject: iommu: Pull IOVA cookie management into the core Now that everyone has converged on iommu-dma for IOMMU_DOMAIN_DMA support, we can abandon the notion of drivers being responsible for the cookie type, and consolidate all the management into the core code. CC: Yong Wu CC: Chunyan Zhang CC: Maxime Ripard Tested-by: Heiko Stuebner Tested-by: Marek Szyprowski Tested-by: Yoshihiro Shimoda Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/46a2c0e7419c7d1d931762dc7b6a69fa082d199a.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4997c78e2670..141779d76035 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,6 +40,7 @@ struct iommu_domain; struct notifier_block; struct iommu_sva; struct iommu_fault_event; +struct iommu_dma_cookie; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -86,7 +87,7 @@ struct iommu_domain { iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; - void *iova_cookie; + struct iommu_dma_cookie *iova_cookie; }; enum iommu_cap { -- cgit v1.2.3 From 7a7c5badf85806eab75e31ab8d45021f1545b0e3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:28 +0100 Subject: iommu: Indicate queued flushes via gather data Since iommu_iotlb_gather exists to help drivers optimise flushing for a given unmap request, it is also the logical place to indicate whether the unmap is strict or not, and thus help them further optimise for whether to expect a sync or a flush_all subsequently.
As part of that, it also seems fair to make the flush queue code take responsibility for enforcing the really subtle ordering requirement it brings, so that we don't need to worry about forgetting that if new drivers want to add flush queue support, and can consolidate the existing versions. While we're adding to the kerneldoc, also fill in some info for @freelist which was overlooked previously. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/bf5f8e2ad84e48c712ccbf80fa8c610594c7595f.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 141779d76035..f7679f6684b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -161,16 +161,22 @@ enum iommu_dev_features { * @start: IOVA representing the start of the range to be flushed * @end: IOVA representing the end of the range to be flushed (inclusive) * @pgsize: The interval at which to perform the flush + * @freelist: Removed pages to free after sync + * @queued: Indicates that the flush will be queued * * This structure is intended to be updated by multiple calls to the * ->unmap() function in struct iommu_ops before eventually being passed - * into ->iotlb_sync(). + * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after + * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to + * them. @queued is set to indicate when ->iotlb_flush_all() will be called + * later instead of ->iotlb_sync(), so drivers may optimise accordingly. */ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; struct page *freelist; + bool queued; }; /** -- cgit v1.2.3 From a8e5f04458c4e496eada2b029ce96713bb6c388d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:29 +0100 Subject: iommu/io-pgtable: Remove non-strict quirk IO_PGTABLE_QUIRK_NON_STRICT was never a very comfortable fit, since it's not a quirk of the pagetable format itself. Now that we have a more appropriate way to convey non-strict unmaps, though, this last of the non-quirk quirks can also go, and with the flush queue code also now enforcing its own ordering we can have a lovely cleanup all round. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/155b5c621cd8936472e273a8b07a182f62c6c20d.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..9ba6d9ea316e 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -73,10 +73,6 @@ struct io_pgtable_cfg { * to support up to 35 bits PA where the bit32, bit33 and bit34 are * encoded in the bit9, bit4 and bit5 of the PTE respectively. * - * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs - * on unmap, for DMA domains using the flush queue mechanism for - * delayed invalidation. - * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. 
* @@ -86,7 +82,6 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) - #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; -- cgit v1.2.3 From bf3aed4660c6e3c44c69f07d8927ee5a22a952ac Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:30 +0100 Subject: iommu: Introduce explicit type for non-strict DMA domains Promote the difference between strict and non-strict DMA domains from an internal detail to a distinct domain feature and type, to pave the road for exposing it through the sysfs default domain interface. Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/08cd2afaf6b63c58ad49acec3517c9b32c2bb946.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f7679f6684b1..5629ae42951f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -61,6 +61,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API implementation */ #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ +#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ /* * This are the possible domain-types @@ -73,12 +74,17 @@ struct iommu_domain_geometry { * IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations. * This flag allows IOMMU drivers to implement * certain optimizations for these domains + * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB + * invalidation. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING) #define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API) +#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ + __IOMMU_DOMAIN_DMA_API | \ + __IOMMU_DOMAIN_DMA_FQ) struct iommu_domain { unsigned type; @@ -90,6 +96,11 @@ struct iommu_domain { struct iommu_dma_cookie *iova_cookie; }; +static inline bool iommu_is_dma_domain(struct iommu_domain *domain) +{ + return domain->type & __IOMMU_DOMAIN_DMA_API; +} + enum iommu_cap { IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA transactions */ -- cgit v1.2.3 From c208916fe6c7b84e3ec95cd91853039596eeb2cf Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:34 +0100 Subject: iommu: Express DMA strictness via the domain type Eliminate the iommu_get_dma_strict() indirection and pipe the information through the domain type from the beginning. Besides the flow simplification this also has several nice side-effects: - Automatically implies strict mode for untrusted devices by virtue of their IOMMU_DOMAIN_DMA override. - Ensures that we only end up using flush queues for drivers which are aware of them and can actually benefit. - Allows us to handle flush queue init failure by falling back to strict mode instead of leaving it to possibly blow up later. 
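In driver terms, the result is that strictness can be derived from the domain type alone; a minimal sketch (the helper name is illustrative, not from the patch):

static bool example_domain_uses_flush_queue(struct iommu_domain *domain)
{
	/* IOMMU_DOMAIN_DMA_FQ implies batched (non-strict) TLB
	 * invalidation; plain IOMMU_DOMAIN_DMA remains strict. */
	return domain->type == IOMMU_DOMAIN_DMA_FQ;
}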
Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/47083d69155577f1367877b1594921948c366eb3.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5629ae42951f..923a8d1c5e39 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -504,7 +504,6 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); void iommu_set_dma_strict(void); -bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -- cgit v1.2.3 From 452e69b58c2889e5546edb92d9e66285410f7463 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:38 +0100 Subject: iommu: Allow enabling non-strict mode dynamically Allocating and enabling a flush queue is in fact something we can reasonably do while a DMA domain is active, without having to rebuild it from scratch. Thus we can allow a strict -> non-strict transition from sysfs without requiring to unbind the device's driver, which is of particular interest to users who want to make selective relaxations to critical devices like the one serving their root filesystem. Disabling and draining a queue also seems technically possible to achieve without rebuilding the whole domain, but would certainly be more involved. Furthermore there's not such a clear use-case for tightening up security *after* the device may already have done whatever it is that you don't trust it not to do, so we only consider the relaxation case. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d652966348c78457c38bf18daf369272a4ebc2c9.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/dma-iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 758ca4694257..24607dc3c2ac 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain); /* Setup call for arch DMA mapping code */ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); +int iommu_dma_init_fq(struct iommu_domain *domain); /* The DMA API isn't _quite_ the whole story, though... */ /* @@ -54,6 +55,11 @@ static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, { } +static inline int iommu_dma_init_fq(struct iommu_domain *domain) +{ + return -EINVAL; +} + static inline int iommu_get_dma_cookie(struct iommu_domain *domain) { return -ENODEV; -- cgit v1.2.3 From 39a2d3506b2d53c569a6db13d65b2f3728c4feec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:05:05 +0200 Subject: dma-mapping: add a dma_init_global_coherent helper Add a new helper to initialize the global coherent pool. This both cleans up the existing initialization which indirects through the reserved_mem_ops that are normally only used for struct device, and also allows using the global pool for non-devicetree architectures. 
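A sketch of the non-devicetree usage this enables, with an illustrative carve-out address and size (not taken from any real platform):

static int __init example_coherent_pool_init(void)
{
	/* 1 MiB of dedicated on-chip RAM for coherent allocations. */
	return dma_init_global_coherent(0x10000000, SZ_1M);
}
core_initcall(example_coherent_pool_init);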
Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2f842498c448..068f1b11a6a4 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -177,7 +177,7 @@ void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) -- cgit v1.2.3 From 0cad6246621b5887d5b33fea84219d2a71f2f99a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Aug 2021 22:08:24 +0200 Subject: vfs: add rcu argument to ->get_acl() callback Add a rcu argument to the ->get_acl() callback to allow get_cached_acl_rcu() to call the ->get_acl() method in the next patch. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c6c34db94..73376dfe28d0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2065,7 +2065,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct user_namespace *, struct inode *, int); - struct posix_acl * (*get_acl)(struct inode *, int); + struct posix_acl * (*get_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); -- cgit v1.2.3 From 332f606b32b6291a944c8cf23b91f53a6e676525 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Aug 2021 22:08:24 +0200 Subject: ovl: enable RCU'd ->get_acl() Overlayfs does not cache ACL's (to avoid double caching). Instead it just calls the underlying filesystem's i_op->get_acl(), which will return the cached value, if possible. In rcu path walk, however, get_cached_acl_rcu() is employed to get the value from the cache, which will fail on overlayfs resulting in dropping out of rcu walk mode. This can result in a big performance hit in certain situations. Fix by calling ->get_acl() with rcu=true in case of ACL_DONT_CACHE (which indicates pass-through) Reported-by: garyhuang Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 5 +++++ include/linux/posix_acl.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 73376dfe28d0..c6e5bcbff0c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,6 +581,11 @@ static inline void mapping_allow_writable(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +/* + * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to + * cache the ACL. This also means that ->get_acl() can be called in RCU mode + * with the LOOKUP_RCU flag. 
+ */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 307094ebb88c..b65c877d92b8 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -72,6 +72,8 @@ extern struct posix_acl *get_posix_acl(struct inode *, int); extern int set_posix_acl(struct user_namespace *, struct inode *, int, struct posix_acl *); +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); + #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, @@ -84,7 +86,6 @@ extern int simple_set_acl(struct user_namespace *, struct inode *, extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); -struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); -- cgit v1.2.3 From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:56 -0400 Subject: tracing: Add DYNAMIC flag for dynamic events To differentiate between static and dynamic events, add a new flag DYNAMIC to the event flags that all dynamic events have set. This will allow to differentiate when attaching to a dynamic event from a static event. Static events have a mod pointer that references the module they were created in (or NULL for core kernel). This can be incremented when the event has something attached to it. But there exists no such mechanism for dynamic events. This is dangerous as the dynamic events may now disappear without the "attachment" knowing that it no longer exists. To enforce the dynamic flag, change dyn_event_add() to pass the event that is being created such that it can set the DYNAMIC flag of the event. This helps make sure that no location that creates a dynamic event misses setting this flag. 
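A simplified sketch of the enforcement point described above (the real dyn_event_add() also validates its arguments and links the event into the dyn_event list under event_mutex):

static inline int dyn_event_add(struct dyn_event *ev,
				struct trace_event_call *call)
{
	/* Every dynamic event is tagged at creation time. */
	call->flags |= TRACE_EVENT_FL_DYNAMIC;
	/* ... list insertion elided ... */
	return 0;
}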
Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..53c9dffd87fd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -310,6 +310,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, }; @@ -321,6 +322,7 @@ enum { * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint + * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe */ @@ -330,6 +332,7 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), }; -- cgit v1.2.3 From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:57 -0400 Subject: tracing: Have dynamic events have a ref counter As dynamic events are not created by modules, if something is attached to one, calling "try_module_get()" on its "mod" field, is not going to keep the dynamic event from going away. Since dynamic events do not need the "mod" pointer of the event structure, make a union out of it in order to save memory (there's one structure for each of the thousand+ events in the kernel), and have any event with the DYNAMIC flag set to use a ref counter instead. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 45 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 53c9dffd87fd..9564c4d9a3b6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -350,7 +350,14 @@ struct trace_event_call { struct trace_event event; char *print_fmt; struct event_filter *filter; - void *mod; + /* + * Static events can disappear with modules, + * where as dynamic ones need their own ref count. 
+ */ + union { + void *module; + atomic_t refcnt; + }; void *data; /* See the TRACE_EVENT_FL_* flags above */ @@ -366,6 +373,42 @@ struct trace_event_call { #endif }; +#ifdef CONFIG_DYNAMIC_EVENTS +bool trace_event_dyn_try_get_ref(struct trace_event_call *call); +void trace_event_dyn_put_ref(struct trace_event_call *call); +bool trace_event_dyn_busy(struct trace_event_call *call); +#else +static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) +{ + /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ + return false; +} +static inline void trace_event_dyn_put_ref(struct trace_event_call *call) +{ +} +static inline bool trace_event_dyn_busy(struct trace_event_call *call) +{ + /* Nothing should call this without DYNAIMIC_EVENTS configured. */ + return true; +} +#endif + +static inline bool trace_event_try_get_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + return trace_event_dyn_try_get_ref(call); + else + return try_module_get(call->module); +} + +static inline void trace_event_put_ref(struct trace_event_call *call) +{ + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + trace_event_dyn_put_ref(call); + else + module_put(call->module); +} + #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { -- cgit v1.2.3 From 6937b7dd434962377e00efc04adac0390c287199 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Tue, 17 Aug 2021 23:34:59 +0530 Subject: PCI: Add support for ACPI _RST reset method _RST is a standard ACPI method that performs a function level reset of a device (ACPI v6.3, sec 7.3.25). Add pci_dev_acpi_reset() to probe for _RST method and execute if present. The default priority of this reset is set to below device-specific and above hardware resets. Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-9-ameynarkhede03@gmail.com Signed-off-by: Shanker Donthineni Signed-off-by: Bjorn Helgaas Reviewed-by: Sinan Kaya Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1f4d248617b..98718f46a61c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -50,7 +50,7 @@ PCI_STATUS_PARITY) /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ -#define PCI_NUM_RESET_METHODS 6 +#define PCI_NUM_RESET_METHODS 7 /* * The PCI interface treats multi-function devices as independent -- cgit v1.2.3 From 9bdc81ce440ec6ea899b236879aee470ec388020 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Tue, 17 Aug 2021 23:35:00 +0530 Subject: PCI: Change the type of probe argument in reset functions Change the type of probe argument in functions which implement reset methods from int to bool to make the context and intent clear. 
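A sketch of the resulting reset-method shape (the example_* names are placeholders for any concrete method):

static int example_reset_method(struct pci_dev *dev, bool probe)
{
	if (!example_reset_supported(dev))
		return -ENOTTY;

	if (probe)		/* PCI_RESET_PROBE */
		return 0;	/* supported, but do nothing */

	return example_do_reset(dev);	/* PCI_RESET_DO_RESET */
}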
Suggested-by: Alex Williamson Link: https://lore.kernel.org/r/20210817180500.1253-10-ameynarkhede03@gmail.com Signed-off-by: Amey Narkhede Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ++++- include/linux/pci_hotplug.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 98718f46a61c..a46363f29b68 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -52,6 +52,9 @@ /* Number of reset methods used in pci_reset_fn_methods array in pci.c */ #define PCI_NUM_RESET_METHODS 7 +#define PCI_RESET_PROBE true +#define PCI_RESET_DO_RESET false + /* * The PCI interface treats multi-function devices as independent * devices. The slot/function address of each device is encoded @@ -1234,7 +1237,7 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void pcie_print_link_status(struct pci_dev *dev); -int pcie_reset_flr(struct pci_dev *dev, int probe); +int pcie_reset_flr(struct pci_dev *dev, bool probe); int pcie_flr(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 2dac431d94ac..3a10d6ec3ee7 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -44,7 +44,7 @@ struct hotplug_slot_ops { int (*get_attention_status) (struct hotplug_slot *slot, u8 *value); int (*get_latch_status) (struct hotplug_slot *slot, u8 *value); int (*get_adapter_status) (struct hotplug_slot *slot, u8 *value); - int (*reset_slot) (struct hotplug_slot *slot, int probe); + int (*reset_slot) (struct hotplug_slot *slot, bool probe); }; /** -- cgit v1.2.3 From 39f75da7bcc829ddc4d40bb60d0e95520de7898b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:31 +0300 Subject: isystem: trim/fixup stdarg.h and other headers Delete/fixup few includes in anticipation of global -isystem compile option removal. Note: crypto/aegis128-neon-inner.c keeps <stdarg.h> due to redefinition of uintptr_t error (one definition comes from <stdint.h>, another from <linux/types.h>). Signed-off-by: Alexey Dobriyan Signed-off-by: Masahiro Yamada --- include/linux/filter.h | 2 -- include/linux/mISDNif.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 83b896044e79..c1711c9f9439 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -5,8 +5,6 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ -#include <stdarg.h> - #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/compat.h> diff --git a/include/linux/mISDNif.h b/include/linux/mISDNif.h index a7330eb3ec64..7dd1f01ec4f9 100644 --- a/include/linux/mISDNif.h +++ b/include/linux/mISDNif.h @@ -18,7 +18,6 @@ #ifndef mISDNIF_H #define mISDNIF_H -#include <stdarg.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/socket.h> -- cgit v1.2.3 From c0891ac15f0428ffa81b2e818d416bdf3cb74ab6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:32 +0300 Subject: isystem: ship and use stdarg.h Ship minimal stdarg.h (1 type, 4 macros) as <linux/stdarg.h>. stdarg.h is the only userspace header commonly used in the kernel. GPL 2 version of <stdarg.h> can be extracted from http://archive.debian.org/debian/pool/main/g/gcc-4.2/gcc-4.2_4.2.4.orig.tar.gz Signed-off-by: Alexey Dobriyan Acked-by: Rafael J.
Wysocki Acked-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- include/linux/kernel.h | 2 +- include/linux/printk.h | 2 +- include/linux/stdarg.h | 11 +++++++++++ include/linux/string.h | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 include/linux/stdarg.h (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 1b2f0a7e00d6..2776423a587e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -2,7 +2,7 @@ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/align.h> #include <linux/limits.h> #include <linux/linkage.h> diff --git a/include/linux/printk.h b/include/linux/printk.h index e834d78f0478..9f3f29ea348e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -2,7 +2,7 @@ #ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/init.h> #include <linux/kern_levels.h> #include <linux/linkage.h> diff --git a/include/linux/stdarg.h b/include/linux/stdarg.h new file mode 100644 index 000000000000..c8dc7f4f390c --- /dev/null +++ b/include/linux/stdarg.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#ifndef _LINUX_STDARG_H +#define _LINUX_STDARG_H + +typedef __builtin_va_list va_list; +#define va_start(v, l) __builtin_va_start(v, l) +#define va_end(v) __builtin_va_end(v) +#define va_arg(v, T) __builtin_va_arg(v, T) +#define va_copy(d, s) __builtin_va_copy(d, s) + +#endif diff --git a/include/linux/string.h b/include/linux/string.h index b48d2d28e0b1..5e96d656be7a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -6,7 +6,7 @@ #include <linux/types.h> /* for size_t */ #include <linux/stddef.h> /* for NULL */ #include <linux/errno.h> /* for E2BIG */ -#include <stdarg.h> +#include <linux/stdarg.h> #include <uapi/linux/string.h> extern char *strndup_user(const char __user *, long); -- cgit v1.2.3 From 22f9feb49950885cdb6e37513f134d154175e743 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:37:00 +0200 Subject: dma-mapping: make the global coherent pool conditional Only build the code to support the global coherent pool if support for it is enabled.
Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- include/linux/dma-map-ops.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 068f1b11a6a4..0d5b06b3a4a6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -171,13 +171,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); - -void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle); -int dma_release_from_global_coherent(int order, void *vaddr); -int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, - size_t size, int *ret); -int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) @@ -187,7 +180,16 @@ static inline int dma_declare_coherent_memory(struct device *dev, #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#ifdef CONFIG_DMA_GLOBAL_POOL +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle); +int dma_release_from_global_coherent(int order, void *vaddr); +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, + size_t size, int *ret); +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size); +#else static inline void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle) { @@ -202,7 +204,7 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, { return 0; } -#endif /* CONFIG_DMA_DECLARE_COHERENT */ +#endif /* CONFIG_DMA_GLOBAL_POOL */ /* * This is the actual return value from the ->alloc_noncontiguous method. -- cgit v1.2.3 From 4d99efb229e63928c6b03a756a2e38cd4777fbe8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:44 +0800 Subject: iommu/vt-d: Update the virtual command related registers The VT-d spec Revision 3.3 updated the virtual command registers, virtual command opcode B register, virtual command response register and virtual command capability register (Section 10.4.43, 10.4.44, 10.4.45, 10.4.46). This updates the virtual command interface implementation in the Intel IOMMU driver accordingly. 
Fixes: 24f27d32ab6b7 ("iommu/vt-d: Enlightened PASID allocation") Signed-off-by: Lu Baolu Cc: Ashok Raj Cc: Sanjay Kumar Cc: Kevin Tian Link: https://lore.kernel.org/r/20210713042649.3547403-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d0fa0b31994d..05a65eb155f7 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,9 +124,9 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 -#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe10 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) -- cgit v1.2.3 From 48811c44349ffbb778d3e36b53beb03ad43a979c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:49 +0800 Subject: iommu/vt-d: Allow devices to have more than 32 outstanding PRs The minimum per-IOMMU PRQ queue size is one 4K page, this is more entries than the hardcoded limit of 32 in the current VT-d code. Some devices can support up to 512 outstanding PRQs but underutilized by this limit of 32. Although, 32 gives some rough fairness when multiple devices share the same IOMMU PRQ queue, but far from optimal for customized use case. This extends the per-IOMMU PRQ queue size to four 4K pages and let the devices have as many outstanding page requests as they can. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 10fa80eef13a..57cceecbe37f 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,6 +14,11 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) +/* Page Request Queue depth */ +#define PRQ_ORDER 2 +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) +#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) + /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done -- cgit v1.2.3 From e1c6b9e1669e44fb7f9688e34e460b759e3b9187 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 2 Aug 2021 19:28:08 +0000 Subject: entry: KVM: Allow use of generic KVM entry w/o full generic support Some architectures (e.g. arm64) have yet to adopt the generic entry infrastructure. Despite that, it would be nice to use some common plumbing for guest entry/exit handling. For example, KVM/arm64 currently does not handle TIF_NOTIFY_PENDING correctly. Allow use of only the generic KVM entry code by tightening up the include list. No functional change intended. 
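A sketch of the run-loop check this makes available on such architectures (a simplified fragment, assuming a vcpu run loop; the surrounding control flow is illustrative):

	int ret;

	/* Before entering the guest, handle any pending work. */
	if (__xfer_to_guest_mode_work_pending()) {
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (ret)
			return ret;
	}
	/* ... enter the guest ... */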
Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210802192809.1851010-3-oupton@google.com --- include/linux/entry-kvm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 136b8d97d8c0..0d7865a0731c 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -2,7 +2,11 @@ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H -#include <linux/entry-common.h> +#include <linux/static_call_types.h> +#include <linux/tracehook.h> +#include <linux/syscalls.h> +#include <linux/seccomp.h> +#include <linux/sched.h> #include <asm/entry-kvm.h> /* Transfer to guest mode work */ -- cgit v1.2.3 From 1cf362e907f36f104b9cf590ee6ced786226b388 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:37 +0530 Subject: PCI: endpoint: Add support to add virtual function in endpoint core Add support to add virtual function in endpoint core. The virtual function can only be associated with a physical function instead of an endpoint controller. Provide APIs to associate a virtual function with a physical function here. [weiyongjun1@huawei.com: PCI: endpoint: Fix missing unlock on error in pci_epf_add_vepf() - Reported-by: Hulk Robot ] Link: https://lore.kernel.org/r/20210819123343.1951-3-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Wei Yongjun Signed-off-by: Lorenzo Pieralisi --- include/linux/pci-epf.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2debc27ba95e..043b4c9c7188 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -121,8 +121,10 @@ struct pci_epf_bar { * @bar: represents the BAR of EPF device * @msi_interrupts: number of MSI interrupts required by this function * @msix_interrupts: number of MSI-X interrupts required by this function - * @func_no: unique function number within this endpoint device + * @func_no: unique (physical) function number within this endpoint device + * @vfunc_no: unique virtual function number within a physical function * @epc: the EPC device to which this EPF device is bound + * @epf_pf: the physical EPF device to which this virtual EPF device is bound * @driver: the EPF driver to which this EPF device is bound * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc * @nb: notifier block to notify EPF of any EPC events (like linkup) @@ -133,6 +135,10 @@ struct pci_epf_bar { * @sec_epc_bar: represents the BAR of EPF device associated with secondary EPC * @sec_epc_func_no: unique (physical) function number within the secondary EPC * @group: configfs group associated with the EPF device + * @is_bound: indicates if bind notification to function driver has been invoked + * @is_vf: true - virtual function, false - physical function + * @vfunction_num_map: bitmap to manage virtual function number + * @pci_vepf: list of virtual endpoint functions associated with this function */ struct pci_epf { struct device dev; @@ -142,8 +148,10 @@ struct pci_epf { u8 msi_interrupts; u16 msix_interrupts; u8 func_no; + u8 vfunc_no; struct pci_epc *epc; + struct pci_epf *epf_pf; struct pci_epf_driver *driver; struct list_head list; struct notifier_block nb; @@ -156,6 +164,10 @@ struct pci_epf { struct pci_epf_bar sec_epc_bar[6]; u8 sec_epc_func_no; struct config_group *group; + unsigned int is_bound; + unsigned int is_vf; + unsigned long vfunction_num_map; + struct list_head pci_vepf; }; /** @@ -199,4 +211,6 @@ int pci_epf_bind(struct pci_epf *epf); void pci_epf_unbind(struct pci_epf
*epf); struct config_group *pci_epf_type_add_cfs(struct pci_epf *epf, struct config_group *group); +int pci_epf_add_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); +void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf); #endif /* __LINUX_PCI_EPF_H */ -- cgit v1.2.3 From 53fd3cbe5e9d791d6bb6059f73a3851f155ce7c6 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 19 Aug 2021 18:03:39 +0530 Subject: PCI: endpoint: Add virtual function number in pci_epc ops Add virtual function number in pci_epc ops. EPC controller driver can perform virtual function specific initialization based on the virtual function number. Link: https://lore.kernel.org/r/20210819123343.1951-5-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi --- include/linux/pci-epc.h | 57 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 50a649d33e68..a48778e1a4ee 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -62,31 +62,32 @@ pci_epc_interface_string(enum pci_epc_interface_type type) * @owner: the module owner containing the ops */ struct pci_epc_ops { - int (*write_header)(struct pci_epc *epc, u8 func_no, + int (*write_header)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); - int (*set_bar)(struct pci_epc *epc, u8 func_no, + int (*set_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - void (*clear_bar)(struct pci_epc *epc, u8 func_no, + void (*clear_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); - int (*map_addr)(struct pci_epc *epc, u8 func_no, + int (*map_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr, u64 pci_addr, size_t size); - void (*unmap_addr)(struct pci_epc *epc, u8 func_no, + void (*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr); - int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 interrupts); - int (*get_msi)(struct pci_epc *epc, u8 func_no); - int (*set_msix)(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); - int (*get_msix)(struct pci_epc *epc, u8 func_no); - int (*raise_irq)(struct pci_epc *epc, u8 func_no, + int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); + int (*get_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*set_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); + int (*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); + int (*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); - int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, + int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); int (*start)(struct pci_epc *epc); void (*stop)(struct pci_epc *epc); const struct pci_epc_features* (*get_features)(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); struct module *owner; }; @@ -128,6 +129,8 @@ struct pci_epc_mem { * single window. 
* @num_windows: number of windows supported by device * @max_functions: max number of functions that can be configured in this EPC + * @max_vfs: Array indicating the maximum number of virtual functions that can + * be associated with each physical function * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops * @function_num_map: bitmap to manage physical function number @@ -141,6 +144,7 @@ struct pci_epc { struct pci_epc_mem *mem; unsigned int num_windows; u8 max_functions; + u8 *max_vfs; struct config_group *group; /* mutex to protect against concurrent access of EP controller */ struct mutex lock; @@ -208,31 +212,32 @@ void pci_epc_linkup(struct pci_epc *epc); void pci_epc_init_notify(struct pci_epc *epc); void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); -int pci_epc_write_header(struct pci_epc *epc, u8 func_no, +int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); -int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, +int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, +void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); -int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, +int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u64 pci_addr, size_t size); -void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, +void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr); -int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 interrupts); -int pci_epc_get_msi(struct pci_epc *epc, u8 func_no); -int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u16 interrupts, - enum pci_barno, u32 offset); -int pci_epc_get_msix(struct pci_epc *epc, u8 func_no); -int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u8 interrupts); +int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u16 interrupts, enum pci_barno, u32 offset); +int pci_epc_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no); +int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); -int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, +int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, enum pci_epc_irq_type type, u16 interrupt_num); int pci_epc_start(struct pci_epc *epc); void pci_epc_stop(struct pci_epc *epc); const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc, - u8 func_no); + u8 func_no, u8 vfunc_no); enum pci_barno pci_epc_get_first_free_bar(const struct pci_epc_features *epc_features); enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features -- cgit v1.2.3 From 249dbe74d3c4b568a623fb55c56cddf19fdf0b89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:22 +0100 Subject: ARM: 9108/1: oabi-compat: rework epoll_wait/epoll_pwait emulation The epoll_wait() system call wrapper is one of the remaining users of the set_fs() infrasturcture for Arm. Changing it to not require set_fs() is rather complex unfortunately. 
The approach I'm taking here is to allow architectures to override the code that copies the output to user space, and let the oabi-compat implementation check whether it is getting called from an EABI or OABI system call based on the thread_info->syscall value. The in_oabi_syscall() check here mirrors the in_compat_syscall() and in_x32_syscall() helpers for 32-bit compat implementations on other architectures. Overall, the amount of code goes down, at least with the newly added sys_oabi_epoll_pwait() helper getting removed again. The downside is added complexity in the source code for the native implementation. There should be no difference in runtime performance except for Arm kernels with CONFIG_OABI_COMPAT enabled that now have to go through an external function call to check which of the two variants to use. Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- include/linux/eventpoll.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 593322c946e6..3337745d81bd 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -68,4 +68,22 @@ static inline void eventpoll_release(struct file *file) {} #endif +#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) +/* ARM OABI has an incompatible struct layout and needs a special handler */ +extern struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent); +#else +static inline struct epoll_event __user * +epoll_put_uevent(__poll_t revents, __u64 data, + struct epoll_event __user *uevent) +{ + if (__put_user(revents, &uevent->events) || + __put_user(data, &uevent->data)) + return NULL; + + return uevent+1; +} +#endif + #endif /* #ifndef _LINUX_EVENTPOLL_H */ -- cgit v1.2.3 From bdec0145286f7e6be9b3134aa35f0f335fa27c38 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Aug 2021 08:30:23 +0100 Subject: ARM: 9114/1: oabi-compat: rework sys_semtimedop emulation sys_oabi_semtimedop() is one of the last users of set_fs() on Arm. To remove this one, expose the internal code of the actual implementation that operates on a kernel pointer and call it directly after copying. There should be no measurable impact on the normal execution of this function, and it makes the overly long function a little shorter, which may help readability. While reworking the oabi version, make it behave a little more like the native one, using kvmalloc_array() and restructure the code flow in a similar way. The naming of __do_semtimedop() is not very good, I hope someone can come up with a better name. One regression was spotted by kernel test robot and fixed before the first mailing list submission. 
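A minimal sketch of the shape this rework takes, assuming a wrapper that copies the OABI sembufs up front and then calls __do_semtimedop() directly; the function name, includes and error handling are illustrative, not the actual arch/arm code:

#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/sem.h>
#include <linux/uaccess.h>

static long example_oabi_semtimedop(int semid, struct sembuf __user *tsops,
				    unsigned int nsops)
{
	struct sembuf *sops;
	long err;

	if (nsops < 1)
		return -EINVAL;

	/* Copy from user space once, then work on kernel pointers only. */
	sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
	if (!sops)
		return -ENOMEM;
	if (copy_from_user(sops, tsops, nsops * sizeof(*sops))) {
		err = -EFAULT;
		goto out;
	}
	/* NULL timeout here, i.e. plain semop() semantics in this sketch. */
	err = __do_semtimedop(semid, sops, nsops, NULL,
			      current->nsproxy->ipc_ns);
out:
	kvfree(sops);
	return err;
}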
Acked-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Signed-off-by: Russell King (Oracle) --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..6c6fc3fd5b72 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1373,6 +1373,9 @@ long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +long __do_semtimedop(int semid, struct sembuf *tsems, unsigned int nsops, + const struct timespec64 *timeout, + struct ipc_namespace *ns); int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); -- cgit v1.2.3 From f7403abf5f06f407c50252e003f5fb332325147b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 20 Aug 2021 14:14:42 +0100 Subject: iommu/io-pgtable: Abstract iommu_iotlb_gather access Previously io-pgtable merely passed the iommu_iotlb_gather pointer through to helpers, but now it has grown its own direct dereference. This turns out to break the build for !IOMMU_API configs where the structure only has a dummy definition. It will probably also crash drivers who don't use the gather mechanism and simply pass in NULL. Wrap this dereference in a suitable helper which can both be stubbed out for !IOMMU_API and encapsulate a NULL check otherwise. Fixes: 7a7c5badf858 ("iommu: Indicate queued flushes via gather data") Reported-by: kernel test robot Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/83672ee76f6405c82845a55c148fa836f56fbbc1.1629465282.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 923a8d1c5e39..a23779c093c7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -548,6 +548,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->start = start; } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return gather && gather->queued; +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -896,6 +901,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, { } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return false; +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3 From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. 
SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: <name>=$<field>[:TYPE] - Fetch the given field of the tracepoint and print it as the given TYPE with the given name. Supported types are basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types (x8/x16/x32/x64), "string", "ustring" and bitfield. Example: attach an event probe on the openat system call and print the name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in the events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, and it can be matched in synthetic events. There is one limitation: an event probe cannot be attached to a kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9564c4d9a3b6..0a0144580bbd 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -313,6 +313,7 @@ enum { TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, + TRACE_EVENT_FL_EPROBE_BIT, }; /* @@ -325,6 +326,7 @@ enum { * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe + * EPROBE - Event is an event probe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -335,6 +337,7 @@ enum { TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), + TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) @@ -680,6 +683,7 @@ enum event_trigger_type { ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), + ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); -- cgit v1.2.3 From edb298c663fccad65fe99fcec6a4f96cc344520d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 10 Aug 2021 23:52:39 +0300 Subject: KVM: x86/mmu: bump mmu notifier count in kvm_zap_gfn_range This, together with the previous patch, ensures that kvm_zap_gfn_range doesn't race with a page fault running on another vcpu, and will make the page fault code retry instead.
This is based on a patch suggested by Sean Christopherson: https://lkml.org/lkml/2021/7/22/1025 Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20210810205251.424103-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f50bfcf225f0..4e43843fe0d7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -991,6 +991,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, -- cgit v1.2.3 From f95937ccf5bd5e0a6bbac2b8e65a87982ffae403 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:29 +0000 Subject: KVM: stats: Support linear and logarithmic histogram statistics Add new types of KVM stats, linear and logarithmic histogram. Histogram are very useful for observing the value distribution of time or size related stats. Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-2-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 90 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e43843fe0d7..09fc0274b1eb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1356,56 +1356,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define 
STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1424,6 +1434,14 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) @@ -1437,10 +1455,52 @@ struct _kvm_stats_desc { STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. 
+ * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; -- cgit v1.2.3 From 87bcc5fa092f82a9890f9e73e4f4c7016ef64049 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:32 +0000 Subject: KVM: stats: Add halt_wait_ns stats for all architectures Add simple stats halt_wait_ns to record the time a VCPU has spent on waiting for all architectures (not just powerpc). Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-5-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 09fc0274b1eb..58a8ffee265e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1452,7 +1452,8 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index ed6a985c5680..291ef55125b2 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -87,6 +87,7 @@ struct kvm_vcpu_stat_generic { u64 halt_wakeup; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 halt_wait_ns; }; #define KVM_STATS_NAME_SIZE 48 -- cgit v1.2.3 From 8ccba534a1a5c6565220c81113d6157571f380cb Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Mon, 2 Aug 2021 16:56:33 +0000 Subject: KVM: stats: Add halt polling related histogram stats Add three log histogram stats to record the distribution of time spent on successful polling, failed polling and VCPU wait. halt_poll_success_hist: Distribution of spent time for a successful poll. halt_poll_fail_hist: Distribution of spent time for a failed poll. halt_wait_hist: Distribution of time a VCPU has spent on waiting. 
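A sketch of how a halt-polling call site might feed these histograms using the helpers above; the surrounding function is invented for illustration, while vcpu->stat.generic and the update macro come from this series:

#include <linux/kvm_host.h>

static void example_record_halt_poll(struct kvm_vcpu *vcpu, ktime_t start,
				     ktime_t cur, bool success)
{
	u64 poll_ns = ktime_to_ns(ktime_sub(cur, start));

	/* fls64() picks the bucket; the helper clamps to the last one. */
	if (success)
		KVM_STATS_LOG_HIST_UPDATE(
			vcpu->stat.generic.halt_poll_success_hist, poll_ns);
	else
		KVM_STATS_LOG_HIST_UPDATE(
			vcpu->stat.generic.halt_poll_fail_hist, poll_ns);
}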
Signed-off-by: Jing Zhang Message-Id: <20210802165633.1866976-6-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 +++++++- include/linux/kvm_types.h | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 58a8ffee265e..e4d712e9f760 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1453,7 +1453,13 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ + HALT_POLL_HIST_COUNT) extern struct dentry *kvm_debugfs_dir; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 291ef55125b2..de7fb5f364d8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,6 +76,8 @@ struct kvm_mmu_memory_cache { }; #endif +#define HALT_POLL_HIST_COUNT 32 + struct kvm_vm_stat_generic { u64 remote_tlb_flush; }; @@ -88,6 +90,9 @@ struct kvm_vcpu_stat_generic { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 halt_wait_ns; + u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; + u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; + u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; }; #define KVM_STATS_NAME_SIZE 48 -- cgit v1.2.3 From 76f3c032adad86aad26f8ad3eebc993b4ba32138 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 20:59:31 +0200 Subject: PCI/VPD: Add pci_vpd_alloc() Several users of the VPD API use a fixed-size buffer and read the VPD into it for further usage. This requires special handling for the case that the buffer isn't big enough to hold the full VPD data. Also the buffer is often allocated on the stack, which isn't too nice. Add pci_vpd_alloc() to dynamically allocate buffer of the correct size and read VPD into it. Link: https://lore.kernel.org/r/955ff598-0021-8446-f856-0c2c077635d7@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index e752cc39a1fe..8c681e24be8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2330,6 +2330,15 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) return info_field[2]; } +/** + * pci_vpd_alloc - Allocate buffer and read VPD into it + * @dev: PCI device + * @size: pointer to field where VPD length is returned + * + * Returns pointer to allocated buffer or an ERR_PTR in case of failure + */ +void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); + /** * pci_vpd_find_tag - Locates the Resource Data Type tag provided * @buf: Pointer to buffered vpd data -- cgit v1.2.3 From 9e515c9f6c0b6f0ace6f5cf2202b527d745b494d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 18 Aug 2021 21:00:57 +0200 Subject: PCI/VPD: Add pci_vpd_find_ro_info_keyword() All users of pci_vpd_find_info_keyword() are interested in the VPD RO section only. In addition all calls are followed by the same activities to calculate start of tag data area and size of the data area. 
Add pci_vpd_find_ro_info_keyword() that combines these functionalities. pci_vpd_find_info_keyword() can be phased out once all users are converted. [bhelgaas: split pci_vpd_check_csum() to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8c681e24be8b..9e3b60963a52 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2363,6 +2363,19 @@ int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, unsigned int len, const char *kw); +/** + * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section + * @buf: Pointer to buffered VPD data + * @len: The length of the buffer area in which to search + * @kw: The keyword to search for + * @size: Pointer to field where length of found keyword data is returned + * + * Returns the index of the information field keyword data or -ENOENT if + * not found. + */ +int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, + const char *kw, unsigned int *size); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3 From 6107e5cb907cffc5576cc1297847f9fc69a8d5d9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 20 Aug 2021 15:32:42 -0500 Subject: PCI/VPD: Add pci_vpd_check_csum() VPD checksum information and checksum calculation are specified by PCIe r5.0, sec 6.28.2.2. Therefore checksum handling can and should be moved into the PCI VPD core. Add pci_vpd_check_csum() to validate the VPD checksum. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/1643bd7a-088e-1028-c9b0-9d112cf48d63@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e3b60963a52..827b7eefd550 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2376,6 +2376,15 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len, const char *kw, unsigned int *size); +/** + * pci_vpd_check_csum - Check VPD checksum + * @buf: Pointer to buffered VPD data + * @len: VPD size + * + * Returns 1 if VPD has no checksum, otherwise 0 or an errno + */ +int pci_vpd_check_csum(const void *buf, unsigned int len); + /* PCI <-> OF binding helpers */ #ifdef CONFIG_OF struct device_node; -- cgit v1.2.3 From f0ab00174eb7574732737fc0734d4b406aed6231 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Aug 2021 17:17:28 -0500 Subject: PCI: Make saved capability state private to core Interfaces and structs for saving and restoring PCI Capability state were declared in include/linux/pci.h, but aren't needed outside drivers/pci/. 
Move these to drivers/pci/pci.h: struct pci_cap_saved_data struct pci_cap_saved_state void pci_allocate_cap_save_buffers() void pci_free_cap_save_buffers() int pci_add_cap_save_buffer() int pci_add_ext_cap_save_buffer() struct pci_cap_saved_state *pci_find_saved_cap() struct pci_cap_saved_state *pci_find_saved_ext_cap() Link: https://lore.kernel.org/r/20210802221728.1469304-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..fd35327812af 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -288,18 +288,6 @@ enum pci_bus_speed { enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); -struct pci_cap_saved_data { - u16 cap_nr; - bool cap_extended; - unsigned int size; - u32 data[]; -}; - -struct pci_cap_saved_state { - struct hlist_node next; - struct pci_cap_saved_data cap; -}; - struct irq_affinity; struct pcie_link_state; struct pci_vpd; @@ -1278,12 +1266,6 @@ int pci_load_saved_state(struct pci_dev *dev, struct pci_saved_state *state); int pci_load_and_free_saved_state(struct pci_dev *dev, struct pci_saved_state **state); -struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); -struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, - u16 cap); -int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); -int pci_add_ext_cap_save_buffer(struct pci_dev *dev, - u16 cap, unsigned int size); int pci_platform_power_transition(struct pci_dev *dev, pci_power_t state); int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); -- cgit v1.2.3 From ca32b5310a1a3835f81f498367f1bb7450c8b67b Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 13 Jul 2021 15:22:36 +0800 Subject: PCI: Optimize pci_resource_len() to reduce kernel size pci_resource_end() can be 0 only when pci_resource_start() is 0. Otherwise, it is definitely an error. In this case, pci_resource_len() should be regarded as 0. Therefore, determining whether pci_resource_start() and pci_resource_end() are both 0 can be reduced to determining only whether pci_resource_end() is 0. Although only one condition judgment is reduced, the macro function pci_resource_len() is widely referenced in the kernel. I used defconfig to compile the latest kernel on X86, and its binary code size was reduced by about 3KB. 
Before: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a67168 0000000000000018 I 68 1 8 After: [ 2] .rela.text RELA 0000000000000000 093bfcb0 0000000001a66598 0000000000000018 I 68 1 8 Link: https://lore.kernel.org/r/20210713072236.3043-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..23ef1a15eb5d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1881,9 +1881,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); #define pci_resource_end(dev, bar) ((dev)->resource[(bar)].end) #define pci_resource_flags(dev, bar) ((dev)->resource[(bar)].flags) #define pci_resource_len(dev,bar) \ - ((pci_resource_start((dev), (bar)) == 0 && \ - pci_resource_end((dev), (bar)) == \ - pci_resource_start((dev), (bar))) ? 0 : \ + ((pci_resource_end((dev), (bar)) == 0) ? 0 : \ \ (pci_resource_end((dev), (bar)) - \ pci_resource_start((dev), (bar)) + 1)) -- cgit v1.2.3 From 817f9916a6e96ae43acdd4e75459ef4f92d96eb1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Aug 2021 18:36:19 +0300 Subject: PCI: Sync __pci_register_driver() stub for CONFIG_PCI=n The CONFIG_PCI=y case got a new parameter long time ago. Sync the stub as well. [bhelgaas: add parameter names] Fixes: 725522b5453d ("PCI: add the sysfs driver name to all modules") Link: https://lore.kernel.org/r/20210813153619.89574-1-andriy.shevchenko@linux.intel.com Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index fd35327812af..a662f6c1f120 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1722,8 +1722,9 @@ static inline void pci_disable_device(struct pci_dev *dev) { } static inline int pcim_enable_device(struct pci_dev *pdev) { return -EIO; } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } -static inline int __pci_register_driver(struct pci_driver *drv, - struct module *owner) +static inline int __must_check __pci_register_driver(struct pci_driver *drv, + struct module *owner, + const char *mod_name) { return 0; } static inline int pci_register_driver(struct pci_driver *drv) { return 0; } -- cgit v1.2.3 From 60bcd91aafd22ef62cef9ae2037fa2e1d4da2fb3 Mon Sep 17 00:00:00 2001 From: Grzegorz Jaszczyk Date: Fri, 18 Jun 2021 21:50:32 +0200 Subject: watchdog: introduce watchdog_dev_suspend/resume The watchdog drivers often disable wdog clock during suspend and then enable it again during resume. Nevertheless the ping worker is still running and can issue low-level ping while the wdog clock is disabled causing the system hang. To prevent such condition register pm notifier in the watchdog core which will call watchdog_dev_suspend/resume and actually cancel ping worker during suspend and restore it back, if needed, during resume. 
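For a driver whose wdog clock is gated while suspended, opting in amounts to one call at probe time. A hedged sketch, with the driver itself made up and watchdog_stop_ping_on_suspend() being the helper this patch adds:

#include <linux/platform_device.h>
#include <linux/watchdog.h>

static int example_wdt_probe(struct platform_device *pdev)
{
	struct watchdog_device *wdd;

	wdd = devm_kzalloc(&pdev->dev, sizeof(*wdd), GFP_KERNEL);
	if (!wdd)
		return -ENOMEM;

	/* wdd->info, wdd->ops, timeouts and clocks would be set up here. */

	/* Park the ping worker across suspend; the core restores it on resume. */
	watchdog_stop_ping_on_suspend(wdd);

	return devm_watchdog_register_device(&pdev->dev, wdd);
}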
Signed-off-by: Grzegorz Jaszczyk Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20210618195033.3209598-2-grzegorz.jaszczyk@linaro.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 9b19e6bb68b5..99660197a36c 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -107,6 +107,7 @@ struct watchdog_device { unsigned int max_hw_heartbeat_ms; struct notifier_block reboot_nb; struct notifier_block restart_nb; + struct notifier_block pm_nb; void *driver_data; struct watchdog_core_data *wd_data; unsigned long status; @@ -116,6 +117,7 @@ struct watchdog_device { #define WDOG_STOP_ON_REBOOT 2 /* Should be stopped on reboot */ #define WDOG_HW_RUNNING 3 /* True if HW watchdog running */ #define WDOG_STOP_ON_UNREGISTER 4 /* Should be stopped on unregister */ +#define WDOG_NO_PING_ON_SUSPEND 5 /* Ping worker should be stopped on suspend */ struct list_head deferred; }; @@ -156,6 +158,12 @@ static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd) set_bit(WDOG_STOP_ON_UNREGISTER, &wdd->status); } +/* Use the following function to stop the wdog ping worker when suspending */ +static inline void watchdog_stop_ping_on_suspend(struct watchdog_device *wdd) +{ + set_bit(WDOG_NO_PING_ON_SUSPEND, &wdd->status); +} + /* Use the following function to check if a timeout value is invalid */ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigned int t) { @@ -209,6 +217,8 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, unsigned int timeout_parm, struct device *dev); extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +int watchdog_dev_suspend(struct watchdog_device *wdd); +int watchdog_dev_resume(struct watchdog_device *wdd); int watchdog_set_last_hw_keepalive(struct watchdog_device *, unsigned int); -- cgit v1.2.3 From 6e7c1770a212239e88ec01ddc7a741505bfd10e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jul 2021 16:23:21 -0400 Subject: fs: simplify get_filesystem_list / get_all_fs_names Just output the '\0'-separated list of supported file systems for block devices directly rather than going through a pointless round of string manipulation. Based on an earlier patch from Al Viro. Vivek: Modified list_bdev_fs_names() and split_fs_names() to return the number of null-terminated strings to the caller. Callers now use that information to loop through all the strings instead of relying on one extra null char being present at the end.
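The consumption pattern this enables, as a sketch; the buffer size and printing loop are illustrative assumptions, while the count-based iteration is what the change above provides:

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>

static void __init example_print_bdev_fs_names(void)
{
	char *names = kzalloc(PAGE_SIZE, GFP_KERNEL);
	char *p;
	int i, count;

	if (!names)
		return;

	count = list_bdev_fs_names(names, PAGE_SIZE);
	/* Walk exactly 'count' '\0'-terminated names; no sentinel needed. */
	for (i = 0, p = names; i < count; i++, p += strlen(p) + 1)
		pr_info("block fs: %s\n", p);
	kfree(names);
}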
Signed-off-by: Christoph Hellwig Signed-off-by: Vivek Goyal Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..c76dfc01cf9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3622,7 +3622,7 @@ int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int __init get_filesystem_list(char *buf); +int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) -- cgit v1.2.3 From e5e26d80840b69c1bcea4f5b0cb7ed4026a8f6a3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 21 Aug 2021 00:58:21 +0200 Subject: gpio: max730x: Use the right include Despite the placement of the header, <linux/spi/max7301.h> is used by drivers/gpio/gpio-max730*. The include needs struct gpio_chip and needs to include <linux/gpio/driver.h>, not the legacy <linux/gpio.h> include. Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/spi/max7301.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/max7301.h b/include/linux/spi/max7301.h index 433c20e2f46e..21449067aedb 100644 --- a/include/linux/spi/max7301.h +++ b/include/linux/spi/max7301.h @@ -2,7 +2,7 @@ #ifndef LINUX_SPI_MAX7301_H #define LINUX_SPI_MAX7301_H -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /* * Some registers must be read back to modify. -- cgit v1.2.3 From 15d82ca23c996d50062286d27ed6a42a8105c04a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 27 Jul 2021 02:06:50 +0800 Subject: PCI: Introduce domain_nr in pci_host_bridge Currently we retrieve the PCI domain number of the host bridge from the bus sysdata (or pci_config_window if PCI_DOMAINS_GENERIC=y). Actually we have the information at PCI host bridge probing time, and it makes sense that we store it into pci_host_bridge. One benefit of doing so is the requirement for supporting PCI on Hyper-V for ARM64, because the host bridge of Hyper-V doesn't have pci_config_window, whereas ARM64 is a PCI_DOMAINS_GENERIC=y arch, so we cannot retrieve the PCI domain number from pci_config_window on an ARM64 Hyper-V guest. As preparation for ARM64 Hyper-V PCI support, we introduce the domain_nr in pci_host_bridge and a sentinel value to allow drivers to set domain numbers properly at probing time. Currently, CONFIG_PCI_DOMAINS_GENERIC=y archs are the only users of this newly-introduced field. Link: https://lore.kernel.org/r/20210726180657.142727-2-boqun.feng@gmail.com Signed-off-by: Boqun Feng Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/pci.h | 11 +++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..01aa201e1df0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -526,6 +526,16 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } +/* + * Currently in ACPI spec, for each PCI host bridge, PCI Segment + * Group number is limited to a 16-bit value, therefore (int)-1 is + * not a valid PCI domain number, and can be used as a sentinel + * value indicating ->domain_nr is not set by the driver (and + * CONFIG_PCI_DOMAINS_GENERIC=y archs will set it with + * pci_bus_find_domain_nr()).
+ */ +#define PCI_DOMAIN_NR_NOT_SET (-1) + struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* Root bus */ @@ -533,6 +543,7 @@ struct pci_host_bridge { struct pci_ops *child_ops; void *sysdata; int busnr; + int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ -- cgit v1.2.3 From 90e7a6de62781c27d6a111fccfb19b807f9b6887 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:29 +0300 Subject: lib/scatterlist: Provide a dedicated function to support table append RDMA is the only in-kernel user that uses __sg_alloc_table_from_pages to append pages dynamically. In the next patch, that mode will be extended and that function will get more parameters, so separate it into a unique function to make the change clearer. Link: https://lore.kernel.org/r/20210824142531.3877007-2-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ecf87484814f..5c700f2a0d18 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -285,14 +285,45 @@ void sg_free_table(struct sg_table *); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt, +struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, struct scatterlist *prv, unsigned int left_pages, gfp_t gfp_mask); -int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, - unsigned int n_pages, unsigned int offset, - unsigned long size, gfp_t gfp_mask); +int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, + unsigned int max_segment, gfp_t gfp_mask); + +/** + * sg_alloc_table_from_pages - Allocate and initialize an sg table from + * an array of pages + * @sgt: The sg table header to use + * @pages: Pointer to an array of page pointers + * @n_pages: Number of pages in the pages array + * @offset: Offset from start of the first page to the start of a buffer + * @size: Number of valid bytes in the buffer (after offset) + * @gfp_mask: GFP allocation mask + * + * Description: + * Allocate and initialize an sg table from a list of pages. Contiguous + * ranges of the pages are squashed into a single scatterlist node. A user + * may provide an offset at a start and a size of valid data in a buffer + * specified by the page array. The returned sg table is released by + * sg_free_table.
+ * + * Returns: + * 0 on success, negative error on failure + */ +static inline int sg_alloc_table_from_pages(struct sg_table *sgt, + struct page **pages, + unsigned int n_pages, + unsigned int offset, + unsigned long size, gfp_t gfp_mask) +{ + return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, + size, UINT_MAX, gfp_mask); +} #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, -- cgit v1.2.3 From 3e302dbc6774a27edaea39a1d5107f0c12e35cf2 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:30 +0300 Subject: lib/scatterlist: Fix wrong update of orig_nents orig_nents should represent the number of entries with pages, but __sg_alloc_table_from_pages sets orig_nents as the number of total entries in the table. This is wrong when the API is used for dynamic allocation where not all the table entries are mapped with pages. It wasn't observed until now, since RDMA umem who uses this API in the dynamic form doesn't use orig_nents implicit or explicit by the scatterlist APIs. Fix it by changing the append API to track the SG append table state and have an API to free the append table according to the total number of entries in the table. Now all APIs set orig_nents as number of enries with pages. Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages") Link: https://lore.kernel.org/r/20210824142531.3877007-3-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 5c700f2a0d18..266754a55327 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -39,6 +39,12 @@ struct sg_table { unsigned int orig_nents; /* original size of list */ }; +struct sg_append_table { + struct sg_table sgt; /* The scatter list table */ + struct scatterlist *prv; /* last populated sge in the table */ + unsigned int total_nents; /* Total entries in the table */ +}; + /* * Notes on SG table design. 
* @@ -280,16 +286,17 @@ typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, - sg_free_fn *); + sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); +void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, - struct page **pages, unsigned int n_pages, unsigned int offset, - unsigned long size, unsigned int max_segment, - struct scatterlist *prv, unsigned int left_pages, - gfp_t gfp_mask); +int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, + struct page **pages, unsigned int n_pages, + unsigned int offset, unsigned long size, + unsigned int max_segment, + unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, -- cgit v1.2.3 From a7e20e31f6c063d928868ecc8e2effb7d4b9fe1b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 14:10:09 +0100 Subject: netfs: Move cookie debug ID to struct netfs_cache_resources Move the cookie debug ID from struct netfs_read_request to struct netfs_cache_resources and drop the 'cookie_' prefix. This makes it available for things that want to use netfs_cache_resources without having a netfs_read_request. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431190784.2908479.13386972676539789127.stgit@warthog.procyon.org.uk/ --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 9062adfa2fb9..5d6a4158a9a6 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -102,6 +102,7 @@ struct netfs_cache_resources { const struct netfs_cache_ops *ops; void *cache_priv; void *cache_priv2; + unsigned int debug_id; /* Cookie debug ID */ }; /* @@ -137,7 +138,6 @@ struct netfs_read_request { struct list_head subrequests; /* Requests to fetch I/O from disk or net */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; - unsigned int cookie_debug_id; atomic_t nr_rd_ops; /* Number of read ops in progress */ atomic_t nr_wr_ops; /* Number of write ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ -- cgit v1.2.3 From 343b7258687ecfbb363bfda8833a7cf641aac524 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:08 +0300 Subject: PCI: Add 'override_only' field to struct pci_device_id Add 'override_only' field to struct pci_device_id to be used as part of pci_match_device(). When set, a driver only matches the entry when dev->driver_override is set to that driver. In addition, add a helper macro named 'PCI_DEVICE_DRIVER_OVERRIDE' to enable setting some data on it. Next patch from this series will use the above functionality. 
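A sketch of an id table entry using the new macro; the IDs here are illustrative, and the literal 1 stands in for the named flag (PCI_ID_F_VFIO_DRIVER_OVERRIDE) that the next patch introduces:

#include <linux/module.h>
#include <linux/pci.h>

static const struct pci_device_id example_vfio_ids[] = {
	/* Matches 15b3:1021 only when driver_override names this driver. */
	{ PCI_DEVICE_DRIVER_OVERRIDE(0x15b3, 0x1021, 1) },
	{ }
};
MODULE_DEVICE_TABLE(pci, example_vfio_ids);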
Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mod_devicetable.h | 2 ++ include/linux/pci.h | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 8e291cfdaf06..2e3ba6d9ece0 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -34,12 +34,14 @@ typedef unsigned long kernel_ulong_t; * Best practice is to use driver_data as an index * into a static list of equivalent device types, * instead of using it as a pointer. + * @override_only: Match only when dev->driver_override is this driver. */ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ kernel_ulong_t driver_data; /* Data private to the driver */ + __u32 override_only; }; diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..0506b1a8c921 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -901,6 +901,21 @@ struct pci_driver { .vendor = (vend), .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID +/** + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with + * override_only flags. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * @driver_override: the 32 bit PCI Device override_only + * + * This macro is used to create a struct pci_device_id that matches only a + * driver_override device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID. + */ +#define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ + .subdevice = PCI_ANY_ID, .override_only = (driver_override) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID -- cgit v1.2.3 From cc6711b0bf36de068b10490198d05ac168377989 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:09 +0300 Subject: PCI / VFIO: Add 'override_only' support for VFIO PCI sub system Expose an 'override_only' helper macro (i.e. PCI_DRIVER_OVERRIDE_DEVICE_VFIO) for VFIO PCI sub system and add the required code to prefix its matching entries with "vfio_" in modules.alias file. It allows VFIO device drivers to include match entries in the modules.alias file produced by kbuild that are not used for normal driver autoprobing and module autoloading. Drivers using these match entries can be connected to the PCI device manually, by userspace, using the existing driver_override sysfs. For example the resulting modules.alias may have: alias pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_core alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci In this example mlx5_core and mlx5_vfio_pci match to the same PCI device. The kernel will autoload and autobind to mlx5_core but the kernel and udev mechanisms will ignore mlx5_vfio_pci. 
When userspace wants to change a device to the VFIO subsystem it can implement a generic algorithm: 1) Identify the sysfs path to the device: /sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0 2) Get the modalias string from the kernel: $ cat /sys/bus/pci/devices/0000:01:00.0/modalias pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00 3) Prefix it with vfio_: vfio_pci:v000015B3d00001021sv000015B3sd00000001bc02sc00i00 4) Search modules.alias for the above string and select the entry that has the fewest *'s: alias vfio_pci:v000015B3d00001021sv*sd*bc*sc*i* mlx5_vfio_pci 5) modprobe the matched module name: $ modprobe mlx5_vfio_pci 6) cat the matched module name to driver_override: echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:01:00.0/driver_override 7) unbind device from original module echo 0000:01:00.0 > /sys/bus/pci/devices/0000:01:00.0/driver/unbind 8) probe PCI drivers (or explicitly bind to mlx5_vfio_pci) echo 0000:01:00.0 > /sys/bus/pci/drivers_probe The algorithm is independent of bus type. In future the other buses with VFIO device drivers, like platform and ACPI, can use this algorithm as well. This patch is the infrastructure to provide the information in the modules.alias to userspace. Convert the only VFIO pci_driver which results in one new line in the modules.alias: alias vfio_pci:v*d*sv*sd*bc*sc*i* vfio_pci Later series introduce additional HW specific VFIO PCI drivers, such as mlx5_vfio_pci. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas # for pci.h Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mod_devicetable.h | 4 ++++ include/linux/pci.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 2e3ba6d9ece0..ae2e75d15b21 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,10 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +enum { + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct pci_device_id - PCI device ID structure * @vendor: Vendor ID to match (or PCI_ANY_ID) diff --git a/include/linux/pci.h b/include/linux/pci.h index 0506b1a8c921..527a1dfd1d06 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -916,6 +916,20 @@ struct pci_driver { .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ .subdevice = PCI_ANY_ID, .override_only = (driver_override) +/** + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO + * "driver_override" PCI device. + * @vend: the 16 bit PCI Vendor ID + * @dev: the 16 bit PCI Device ID + * + * This macro is used to create a struct pci_device_id that matches a + * specific device. The subvendor and subdevice fields will be set to + * PCI_ANY_ID and the driver_override will be set to + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. 
+ */ +#define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) + /** * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem * @vend: the 16 bit PCI Vendor ID -- cgit v1.2.3 From 7fa005caa35ed92563b9e9d88d319b2623763a77 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 26 Aug 2021 13:39:12 +0300 Subject: vfio/pci: Introduce vfio_pci_core.ko Now that vfio_pci has been split into two source modules, one focusing on the "struct pci_driver" (vfio_pci.c) and a toolbox library of code (vfio_pci_core.c), complete the split and move them into two different kernel modules. As before vfio_pci.ko continues to present the same interface under sysfs and this change will have no functional impact. Splitting into another module and adding exports allows creating new HW specific VFIO PCI drivers that can implement device specific functionality, such as VFIO migration interfaces or specialized device requirements. Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20210826103912.128972-14-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 239 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 include/linux/vfio_pci_core.h (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h new file mode 100644 index 000000000000..ef9a44b6cf5d --- /dev/null +++ b/include/linux/vfio_pci_core.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 
+ * Author: Tom Lyon, pugs@cisco.com + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifndef VFIO_PCI_CORE_H +#define VFIO_PCI_CORE_H + +#define VFIO_PCI_OFFSET_SHIFT 40 + +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +/* Special capability IDs predefined access */ +#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ +#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ + +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct vfio_pci_ioeventfd { + struct list_head next; + struct vfio_pci_core_device *vdev; + struct virqfd *virqfd; + void __iomem *addr; + uint64_t data; + loff_t pos; + int bar; + int count; + bool test_mem; +}; + +struct vfio_pci_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; + struct irq_bypass_producer producer; +}; + +struct vfio_pci_core_device; +struct vfio_pci_region; + +struct vfio_pci_regops { + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + void (*release)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region); + int (*mmap)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vm_area_struct *vma); + int (*add_capability)(struct vfio_pci_core_device *vdev, + struct vfio_pci_region *region, + struct vfio_info_cap *caps); +}; + +struct vfio_pci_region { + u32 type; + u32 subtype; + const struct vfio_pci_regops *ops; + void *data; + size_t size; + u32 flags; +}; + +struct vfio_pci_dummy_resource { + struct resource resource; + int index; + struct list_head res_next; +}; + +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + +struct vfio_pci_core_device { + struct vfio_device vdev; + struct pci_dev *pdev; + void __iomem *barmap[PCI_STD_NUM_BARS]; + bool bar_mmap_supported[PCI_STD_NUM_BARS]; + u8 *pci_config_map; + u8 *vconfig; + struct perm_bits *msi_perm; + spinlock_t irqlock; + struct mutex igate; + struct vfio_pci_irq_ctx *ctx; + int num_ctx; + int irq_type; + int num_regions; + struct vfio_pci_region *region; + u8 msi_qmax; + u8 msix_bar; + u16 msix_size; + u32 msix_offset; + u32 rbar[7]; + bool pci_2_3; + bool virq_disabled; + bool reset_works; + bool extended_caps; + bool bardirty; + bool has_vga; + bool needs_reset; + bool nointx; + bool needs_pm_restore; + struct pci_saved_state *pci_saved_state; + struct pci_saved_state *pm_save; + int ioeventfds_nr; + struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; + struct list_head dummy_resources_list; + struct mutex ioeventfds_lock; + struct list_head ioeventfds_list; + struct vfio_pci_vf_token *vf_token; + struct notifier_block nb; + struct mutex vma_lock; + struct list_head vma_list; + struct rw_semaphore memory_lock; +}; + +#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) +#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) +#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) +#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) +#define irq_is(vdev, type) (vdev->irq_type == type) + +extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +extern void 
vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); + +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, + uint32_t flags, unsigned index, + unsigned start, unsigned count, void *data); + +extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + +extern int vfio_pci_init_perm_bits(void); +extern void vfio_pci_uninit_perm_bits(void); + +extern int vfio_config_init(struct vfio_pci_core_device *vdev); +extern void vfio_config_free(struct vfio_pci_core_device *vdev); + +extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); + +extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, + pci_power_t state); + +extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device + *vdev); +extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, + u16 cmd); + +#ifdef CONFIG_VFIO_PCI_IGD +extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) +{ + return -ENODEV; +} +#endif + +#ifdef CONFIG_S390 +extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +#else +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + return -ENODEV; +} +#endif + +/* Will be exported for vfio pci drivers usage */ +void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, + bool is_disable_idle_d3); +void vfio_pci_core_close_device(struct vfio_device *core_vdev); +void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, + struct pci_dev *pdev, + const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); +void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); +int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); +extern const struct pci_error_handlers vfio_pci_core_err_handlers; +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg); +ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos); +int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); +void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); +int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); +void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); + +static inline 
bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} + +#endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From b3636a3a2c51715736d3ec45f635ed03191962ce Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 31 Jul 2021 22:50:32 +0300 Subject: PM: runtime: add devm_pm_runtime_enable helper A typical code pattern for pm_runtime_enable() call is to call it in the _probe function and to call pm_runtime_disable() both from _probe error path and from _remove function. For some drivers the whole remove function would consist of the call to pm_runtime_disable(). Add a helper function to replace this boilerplate piece of code. Calling devm_pm_runtime_enable() removes the need for calling pm_runtime_disable() both in the probe()'s error path and in the remove() function. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210731195034.979084-2-dmitry.baryshkov@linaro.org Acked-by: Rafael J. Wysocki Signed-off-by: Stephen Boyd --- include/linux/pm_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index aab8b35e9f8a..222da43b7096 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -59,6 +59,8 @@ extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); +extern int devm_pm_runtime_enable(struct device *dev); + /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. @@ -253,6 +255,8 @@ static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} +static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } + static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} -- cgit v1.2.3 From a649136b17af6361355874a10e9219390c266a2c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 31 Jul 2021 22:50:33 +0300 Subject: PM: runtime: add devm_pm_clk_create helper A typical code pattern for pm_clk_create() call is to call it in the _probe function and to call pm_clk_destroy() both from _probe error path and from _remove function. For some drivers the whole remove function would consist of the call to pm_clk_destroy(). Add a helper function to replace this boilerplate piece of code. Calling devm_pm_clk_create() removes the need for calling pm_clk_destroy() both in the probe()'s error path and in the remove() function. Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20210731195034.979084-3-dmitry.baryshkov@linaro.org Acked-by: Rafael J.
Wysocki Signed-off-by: Stephen Boyd --- include/linux/pm_clock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 8ddc7860e131..ada3a0ab10bf 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -47,6 +47,7 @@ extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); +extern int devm_pm_clk_create(struct device *dev); #else static inline bool pm_clk_no_clocks(struct device *dev) { @@ -83,6 +84,10 @@ static inline void pm_clk_remove(struct device *dev, const char *con_id) { } static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) { } +static inline int devm_pm_clk_create(struct device *dev) +{ + return -EINVAL; +} #endif #ifdef CONFIG_HAVE_CLK -- cgit v1.2.3 From 8c09e896cef8d908dd9a20a9f2a5c3fcb9799de3 Mon Sep 17 00:00:00 2001 From: Zhangfei Gao Date: Tue, 13 Jul 2021 10:54:34 +0800 Subject: PCI: Allow PASID on fake PCIe devices without TLP prefixes Some systems, e.g., HiSilicon KunPeng920 and KunPeng930, have devices that appear as PCI but are actually on the AMBA bus. Some of these fake PCI devices support a PASID-like feature and they do have a working PASID capability even though they do not use the PCIe Transaction Layer Protocol and do not support TLP prefixes. Add a pasid_no_tlp bit for this "PASID works without TLP prefixes" case and update pci_enable_pasid() so it can enable PASID on these devices. Set this bit for HiSilicon KunPeng920 and KunPeng930. [bhelgaas: squashed, commit log] Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/1626144876-11352-2-git-send-email-zhangfei.gao@linaro.org Link: https://lore.kernel.org/r/1626144876-11352-3-git-send-email-zhangfei.gao@linaro.org Signed-off-by: Zhangfei Gao Signed-off-by: Jean-Philippe Brucker Signed-off-by: Zhou Wang Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..28165dc5b221 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -388,6 +388,7 @@ struct pci_dev { supported from root to here */ u16 l1ss; /* L1SS Capability pointer */ #endif + unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ pci_channel_state_t error_state; /* Current connectivity state */ -- cgit v1.2.3 From 1b7646014e0d838b06be7288e2dec3262948cc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:05 +0200 Subject: dax: mark dax_get_by_host static And move the code around a bit to avoid a forward declaration.
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-5-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b52f084aa643..379739b55408 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,7 +41,6 @@ struct dax_operations { extern struct attribute_group dax_attribute_group; #if IS_ENABLED(CONFIG_DAX) -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags); void put_dax(struct dax_device *dax_dev); @@ -73,10 +72,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else -static inline struct dax_device *dax_get_by_host(const char *host) -{ - return NULL; -} static inline struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags) { -- cgit v1.2.3 From cd93a2a4d1b076f5c73d70d836c202bbcbeea49e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:07 +0200 Subject: dax: remove __generic_fsdax_supported Just implement generic_fsdax_supported directly out of line instead of adding a wrapper. Given that generic_fsdax_supported is only supplied for CONFIG_FS_DAX builds, this also allows us to not provide it at all for !CONFIG_FS_DAX builds. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-7-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 379739b55408..0a3ef9701e03 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -123,16 +123,9 @@ static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) return __bdev_dax_supported(bdev, blocksize); } -bool __generic_fsdax_supported(struct dax_device *dax_dev, +bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); -static inline bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, - sectors); -} static inline void fs_put_dax(struct dax_device *dax_dev) { @@ -154,12 +147,7 @@ static inline bool bdev_dax_supported(struct block_device *bdev, return false; } -static inline bool generic_fsdax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t sectors) -{ - return false; -} +#define generic_fsdax_supported NULL static inline void fs_put_dax(struct dax_device *dax_dev) { -- cgit v1.2.3 From 60b8340f0d6587d7b51990689fcdae567f309fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:08 +0200 Subject: dax: stub out dax_supported for !CONFIG_FS_DAX dax_supported calls into ->dax_supported which checks for fsdax support. Don't bother building it for !CONFIG_FS_DAX as it will always return false.
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-8-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 0a3ef9701e03..32dce5763f2c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -57,8 +57,6 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev) { __set_dax_synchronous(dax_dev); } -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); /* * Check if given mapping is supported by the file / underlying device. */ @@ -101,12 +99,6 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { @@ -127,6 +119,9 @@ bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); + static inline void fs_put_dax(struct dax_device *dax_dev) { put_dax(dax_dev); @@ -149,6 +144,13 @@ static inline bool bdev_dax_supported(struct block_device *bdev, #define generic_fsdax_supported NULL +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} + static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3 From bdd3c50d83bf7f6acc869b48d02670d19030ae03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Aug 2021 15:55:10 +0200 Subject: dax: remove bdev_dax_supported All callers already have a dax_device obtained from fs_dax_get_by_bdev at hand, so just pass that to dax_supported() instead of doing another lookup.
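To make the resulting call-site pattern concrete, a mount-time check would now look roughly like this (a minimal sketch that assumes the caller already holds a dax_device reference from fs_dax_get_by_bdev(); it is not the literal upstream conversion of any particular filesystem):

	/* sketch: probe DAX suitability with the dax_device we already have */
	struct dax_device *dax_dev = fs_dax_get_by_bdev(bdev);

	if (!dax_supported(dax_dev, bdev, blocksize, 0,
			   i_size_read(bdev->bd_inode) / 512)) {
		/* fall back to a non-DAX mount */
	}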
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210826135510.6293-10-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 32dce5763f2c..2619d94c308d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -109,12 +109,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) -bool __bdev_dax_supported(struct block_device *bdev, int blocksize); -static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) -{ - return __bdev_dax_supported(bdev, blocksize); -} - bool generic_fsdax_supported(struct dax_device *dax_dev, struct block_device *bdev, int blocksize, sector_t start, sector_t sectors); @@ -136,12 +130,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); #else -static inline bool bdev_dax_supported(struct block_device *bdev, - int blocksize) -{ - return false; -} - #define generic_fsdax_supported NULL static inline bool dax_supported(struct dax_device *dax_dev, -- cgit v1.2.3 From 2908f5e101e3fb1d478cff1c556966e1af816641 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Feb 2020 10:00:22 +0000 Subject: fscache: Add a cookie debug ID and use that in traces Add a cookie debug ID and use that in traces and in procfiles rather than displaying the (hashed) pointer to the cookie. This is easier to correlate and we don't lose anything when interpreting oops output since that shows unhashed addresses and registers that aren't comparable to the hashed values. Changes: ver #2: - Fix the fscache_op tracepoint to handle a NULL cookie pointer. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/158861210988.340223.11688464116498247790.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465769844.1376105.14119502774019865432.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588459097.3465195.1273313637721852165.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162431193544.2908479.17556704572948300790.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index abc1c4737fb8..ba58c427cf9a 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -126,6 +126,7 @@ struct fscache_cookie { atomic_t usage; /* number of users of this cookie */ atomic_t n_children; /* number of children of this cookie */ atomic_t n_active; /* number of active users of netfs ptrs */ + unsigned int debug_id; spinlock_t lock; spinlock_t stores_lock; /* lock on page store tree */ struct hlist_head backing_objects; /* object(s) backing this file/index */ -- cgit v1.2.3 From 884a76881fc5f5c9c04de1b640bed2c340929842 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Feb 2020 10:00:22 +0000 Subject: fscache: Procfile to display cookies Add /proc/fs/fscache/cookies to display active cookies. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/158861211871.340223.7223853943667440807.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465771021.1376105.6933857529128238020.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588460994.3465195.16963417803501149328.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162431194785.2908479.786917990782538164.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ba58c427cf9a..ea61e54a6bc5 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -133,6 +133,7 @@ struct fscache_cookie { const struct fscache_cookie_def *def; /* definition */ struct fscache_cookie *parent; /* parent of this entry */ struct hlist_bl_node hash_link; /* Link in hash table */ + struct list_head proc_link; /* Link in proc list */ void *netfs_data; /* back pointer to netfs */ struct radix_tree_root stores; /* pages to be stored on this cookie */ #define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ -- cgit v1.2.3 From 6ae9bd8bb037b7c422bafde746f2338a716f6058 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 09:40:19 +0100 Subject: fscache, cachefiles: Remove the histogram stuff Remove the histogram stuff as it's mostly going to be outdated. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431195953.2908479.16770977195634296638.stgit@warthog.procyon.org.uk/ --- include/linux/fscache-cache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 3235ddbdcc09..fbff0b7e3ef1 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -147,7 +147,6 @@ struct fscache_retrieval { fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ void *context; /* netfs read context (pinned) */ struct list_head to_do; /* list of things to be done by the backend */ - unsigned long start_time; /* time at which retrieval started */ atomic_t n_pages; /* number of pages to be retrieved */ }; -- cgit v1.2.3 From 58f386a73f16cea1f78e8466cc5c402eb7f6fcf8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 May 2021 09:59:17 +0100 Subject: fscache: Remove the object list procfile Remove the object list procfile from fscache as objects will become entirely internal to the cache. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431198332.2908479.5847286163455099669.stgit@warthog.procyon.org.uk/ --- include/linux/fscache-cache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index fbff0b7e3ef1..8d39491c5f9f 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -384,9 +384,6 @@ struct fscache_object { struct list_head dependents; /* FIFO of dependent objects */ struct list_head dep_link; /* link in parent's dependents list */ struct list_head pending_ops; /* unstarted operations on this object */ -#ifdef CONFIG_FSCACHE_OBJECT_LIST - struct rb_node objlist_link; /* link in global object list */ -#endif pgoff_t store_limit; /* current storage limit */ loff_t store_limit_l; /* current storage limit */ }; -- cgit v1.2.3 From 20ec197bfa13c5b799fc9527790ea7b5374fc8f2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Mar 2021 13:53:50 +0100 Subject: fscache: Use refcount_t for the cookie refcount instead of atomic_t Use refcount_t for the fscache_cookie refcount instead of atomic_t and rename the 'usage' member to 'ref' in such cases. The tracepoints that reference it change from showing "u=%d" to "r=%d". Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/162431204358.2908479.8006938388213098079.stgit@warthog.procyon.org.uk/ --- include/linux/fscache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ea61e54a6bc5..a4dab5998613 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -123,7 +123,7 @@ struct fscache_cookie { - atomic_t usage; /* number of users of this cookie */ + refcount_t ref; /* number of users of this cookie */ atomic_t n_children; /* number of children of this cookie */ atomic_t n_active; /* number of active users of netfs ptrs */ unsigned int debug_id; -- cgit v1.2.3 From 3a3f976639f267823e443fdd8bffa03848fa1c3f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:14 -0400 Subject: SUNRPC keep track of number of transports to unique addresses Currently, xprt_switch keeps a count of all xprts (xps_nxprts) that were added to the switch, regardless of whether it's an nconnect transport or a transport to a trunkable address. Introduce a new counter to keep track of transports to unique destination addresses per xprt_switch.
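To illustrate the accounting this implies, adding a transport to the switch would bump the new counter only when its destination address has not been seen before; a minimal sketch (the lookup helper and the placement of the check are assumptions, not the literal upstream code):

	/* sketch: count only transports to a previously unseen address */
	if (!rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
		xps->xps_nunique_destaddr_xprts++;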
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtmultipath.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index b19addc8b715..bbb8a5fa0816 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -18,6 +18,7 @@ struct rpc_xprt_switch { unsigned int xps_id; unsigned int xps_nxprts; unsigned int xps_nactive; + unsigned int xps_nunique_destaddr_xprts; atomic_long_t xps_queuelen; struct list_head xps_xprt_list; -- cgit v1.2.3 From 7e134205f62955369619021a695cd78fefd32451 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:17 -0400 Subject: NFSv4 introduce max_connect mount options This option controls how many xprts to distinct server addresses the client can establish (nconnect connections are not counted towards this new limit). This patch sets up the NFS structures to keep track of the max_connect limit (it does not enforce it). The default value is kept at 1 so that current mounts that don't want any additional connections are not affected. The maximum value is set at 16. Mounts to a DS are not limited to the default value of 1 but are instead set to the maximum of 16 (NFS_MAX_TRANSPORTS). Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 5 +++++ include/linux/nfs_fs_sb.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ce6474594872..b9a8b925db43 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -40,6 +40,11 @@ #include +/* + * These are the default for number of transports to different server IPs + */ +#define NFS_MAX_TRANSPORTS 16 + /* * These are the default flags for swap requests */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d71a0e90faeb..2a9acbfe00f0 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -62,6 +62,7 @@ struct nfs_client { u32 cl_minorversion;/* NFSv4 minorversion */ unsigned int cl_nconnect; /* Number of connections */ + unsigned int cl_max_connect; /* max number of xprts allowed */ const char * cl_principal; /* used for machine cred */ #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.3 From dc48e0abee245e2f0361bd8d4e3b00f70450fab2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:18 -0400 Subject: SUNRPC enforce creation of no more than max_connect xprts If we are adding new transports via rpc_clnt_test_and_add_xprt() then check if we've reached the limit. Currently only the pNFS path adds transports via that function, but this is done in preparation for the client adding new transports when session trunking is detected. A warning is logged if the limit is reached.
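A minimal sketch of such a check inside rpc_clnt_test_and_add_xprt(), reusing the counter and the cl_max_connect field introduced above (the exact upstream control flow and message wording may differ):

	/* sketch: refuse another transport once the per-client ceiling is hit */
	if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) {
		pr_warn("SUNRPC: reached max allowed number (%d) of transports to server %s\n",
			clnt->cl_max_connect,
			rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR));
		return -EINVAL;
	}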
Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b2edd5fc2f0c..a4661646adc9 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -82,6 +82,7 @@ struct rpc_clnt { struct work_struct cl_work; }; const struct cred *cl_cred; + unsigned int cl_max_connect; /* max number of transports not to the same IP */ }; /* @@ -136,6 +137,7 @@ struct rpc_create_args { char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ const struct cred *cred; + unsigned int max_connect; }; struct rpc_add_xprt_test { -- cgit v1.2.3 From ab959c7d4ea086852f35c7ff20ecd79b7471cfad Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 6 Aug 2021 10:53:21 +0100 Subject: dmaengine: Extend the dma_slave_width for 128 bytes Add DMA_SLAVE_BUSWIDTH_128_BYTES to enum dma_slave_buswidth so DMA engines and users can select 128 bytes as the bus width. Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20210806095322.2326-3-biju.das.jz@bp.renesas.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 93c3ca5fdafd..e5c2c9e71bf1 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -380,6 +380,7 @@ enum dma_slave_buswidth { DMA_SLAVE_BUSWIDTH_16_BYTES = 16, DMA_SLAVE_BUSWIDTH_32_BYTES = 32, DMA_SLAVE_BUSWIDTH_64_BYTES = 64, + DMA_SLAVE_BUSWIDTH_128_BYTES = 128, }; /** @@ -398,7 +399,7 @@ enum dma_slave_buswidth { * @src_addr_width: this is the width in bytes of the source (RX) * register where DMA data shall be read. If the source * is memory this may be ignored depending on architecture. - * Legal values: 1, 2, 3, 4, 8, 16, 32, 64. + * Legal values: 1, 2, 3, 4, 8, 16, 32, 64, 128. * @dst_addr_width: same as src_addr_width but for destination * target (TX) mutatis mutandis. * @src_maxburst: the maximum number of words (note: words, as in -- cgit v1.2.3 From d7e7747ac5c2496c98291944c6066adaa9f3b975 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:22 +0200 Subject: netfilter: refuse insertion if chain has grown too large Also add a stat counter for this that gets exported both via the old /proc interface and ctnetlink. Assuming the old default size of 16536 buckets and max hash occupancy of 64k, this results in 128k insertions (origin+reply), so ~8 entries per chain on average. The revised settings in this series will result in about two entries per bucket on average. This allows a hard-limit ceiling of 64. This is not tunable at the moment, but it's possible to either increase nf_conntrack_buckets or decrease nf_conntrack_max to reduce average lengths.
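As an illustration of the insertion-path check being described, the ceiling can be enforced while walking a hash bucket, roughly as follows (a sketch: the constant name, the walk details and the error handling are assumptions, not the literal upstream code):

	#define MAX_CHAINLEN	64u

	unsigned int chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (chainlen++ > MAX_CHAINLEN)
			goto chaintoolong;
		/* ... existing clash/duplicate checks ... */
	}
	return 0;

chaintoolong:
	/* bump the new stat counter and refuse the insertion */
	NF_CT_STAT_INC(net, chaintoolong);
	return -ENOSPC;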
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 0c7d8d1e945d..700ea077ce2d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -18,6 +18,7 @@ struct ip_conntrack_stat { unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; + unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL -- cgit v1.2.3 From a61590892ef097c180144fa469abe2256b9ae715 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:53:42 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_tag() Now that the last users have been migrated to pci_vpd_find_ro_info_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/71131eca-0502-7878-365f-30b6614161cf@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 827b7eefd550..4fb233e374c5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,17 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_tag - Locates the Resource Data Type tag provided - * @buf: Pointer to buffered vpd data - * @len: The length of the vpd buffer - * @rdt: The Resource Data Type to search for - * - * Returns the index where the Resource Data Type was found or - * -ENOENT otherwise. - */ -int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); - /** * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD * @buf: Pointer to buffered vpd data -- cgit v1.2.3 From 59b83b29bb5532bbff54a271e0b4f321e28b954f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:54:23 +0200 Subject: PCI/VPD: Stop exporting pci_vpd_find_info_keyword() Now that the last users have been migrated to pci_vpd_find_ro_info_keyword() we can stop exporting this function. It's still used in VPD core code. Link: https://lore.kernel.org/r/96ca2a56-383e-9b61-9cba-4f1e5611dc15@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4fb233e374c5..196cbf4c76a1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,19 +2339,6 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); -/** - * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD - * @buf: Pointer to buffered vpd data - * @off: The offset into the buffer at which to begin the search - * @len: The length of the buffer area, relative to off, in which to search - * @kw: The keyword to search for - * - * Returns the index where the information field keyword was found or - * -ENOENT otherwise.
- */ -int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, - unsigned int len, const char *kw); - /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3 From acfbb1b8a494d7bfd316dfb363a820e6df637e8d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:55:43 +0200 Subject: PCI/VPD: Add pci_vpd_find_id_string() Add a pci_vpd_find_id_string() API function to retrieve the ID string from VPD. This way callers don't need pci_vpd_lrdt_size() any longer, and it can be made private to the VPD core. Link: https://lore.kernel.org/r/c5225bf6-8d29-970d-e271-0d7b52252630@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 196cbf4c76a1..ea330ca0501a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2339,6 +2339,16 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) */ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size); +/** + * pci_vpd_find_id_string - Locate id string in VPD + * @buf: Pointer to buffered VPD data + * @len: The length of the buffer area in which to search + * @size: Pointer to field where length of id string is returned + * + * Returns the index of the id string or -ENOENT if not found. + */ +int pci_vpd_find_id_string(const u8 *buf, unsigned int len, unsigned int *size); + /** * pci_vpd_find_ro_info_keyword - Locate info field keyword in VPD RO section * @buf: Pointer to buffered VPD data -- cgit v1.2.3 From 06e1913d457121a98ee276179734c34dab30f388 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 26 Aug 2021 20:57:01 +0200 Subject: PCI/VPD: Clean up public VPD defines and inline functions After the recent introduction of new VPD API functions and the migration of their users, these defines and inline functions are no longer used outside the VPD core. Link: https://lore.kernel.org/r/d33e06bf-bc5e-ece7-bf35-7245ae224d1b@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 69 ----------------------------------------------------- 1 file changed, 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index ea330ca0501a..303034d03c33 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2255,81 +2255,12 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask); #define PCI_VPD_LRDT_RO_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RO_DATA) #define PCI_VPD_LRDT_RW_DATA PCI_VPD_LRDT_ID(PCI_VPD_LTIN_RW_DATA) -/* Small Resource Data Type Tag Item Names */ -#define PCI_VPD_STIN_END 0x0f /* End */ - -#define PCI_VPD_SRDT_END (PCI_VPD_STIN_END << 3) - -#define PCI_VPD_SRDT_TIN_MASK 0x78 -#define PCI_VPD_SRDT_LEN_MASK 0x07 -#define PCI_VPD_LRDT_TIN_MASK 0x7f - -#define PCI_VPD_LRDT_TAG_SIZE 3 -#define PCI_VPD_SRDT_TAG_SIZE 1 - -#define PCI_VPD_INFO_FLD_HDR_SIZE 3 - #define PCI_VPD_RO_KEYWORD_PARTNO "PN" #define PCI_VPD_RO_KEYWORD_SERIALNO "SN" #define PCI_VPD_RO_KEYWORD_MFR_ID "MN" #define PCI_VPD_RO_KEYWORD_VENDOR0 "V0" #define PCI_VPD_RO_KEYWORD_CHKSUM "RV" -/** - * pci_vpd_lrdt_size - Extracts the Large Resource Data Type length - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type length.
- */ -static inline u16 pci_vpd_lrdt_size(const u8 *lrdt) -{ - return (u16)lrdt[1] + ((u16)lrdt[2] << 8); -} - -/** - * pci_vpd_lrdt_tag - Extracts the Large Resource Data Type Tag Item - * @lrdt: Pointer to the beginning of the Large Resource Data Type tag - * - * Returns the extracted Large Resource Data Type Tag item. - */ -static inline u16 pci_vpd_lrdt_tag(const u8 *lrdt) -{ - return (u16)(lrdt[0] & PCI_VPD_LRDT_TIN_MASK); -} - -/** - * pci_vpd_srdt_size - Extracts the Small Resource Data Type length - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type length. - */ -static inline u8 pci_vpd_srdt_size(const u8 *srdt) -{ - return (*srdt) & PCI_VPD_SRDT_LEN_MASK; -} - -/** - * pci_vpd_srdt_tag - Extracts the Small Resource Data Type Tag Item - * @srdt: Pointer to the beginning of the Small Resource Data Type tag - * - * Returns the extracted Small Resource Data Type Tag Item. - */ -static inline u8 pci_vpd_srdt_tag(const u8 *srdt) -{ - return ((*srdt) & PCI_VPD_SRDT_TIN_MASK) >> 3; -} - -/** - * pci_vpd_info_field_size - Extracts the information field length - * @info_field: Pointer to the beginning of an information field header - * - * Returns the extracted information field length. - */ -static inline u8 pci_vpd_info_field_size(const u8 *info_field) -{ - return info_field[2]; -} - /** * pci_vpd_alloc - Allocate buffer and read VPD into it * @dev: PCI device -- cgit v1.2.3 From 4b92d4add5f6dcf21275185c997d6ecb800054cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Aug 2021 13:48:34 +0200 Subject: drivers: base: cacheinfo: Get rid of DEFINE_SMP_CALL_CACHE_FUNCTION() DEFINE_SMP_CALL_CACHE_FUNCTION() was useful before the CPU hotplug rework to ensure that the cache-related functions are called on the upcoming CPU, because the notifier itself could run on any online CPU. The hotplug state machine guarantees that the callbacks are invoked on the upcoming CPU. So there is no need to have this SMP function call obfuscation. That indirection was missed when the hotplug notifiers were converted. This also solves the problem of ARM64 init_cache_level() invoking ACPI functions which take a semaphore in that context. That's invalid as SMP function calls run with interrupts disabled. Running it just from the callback in the context of the CPU hotplug thread solves this.
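For context, the conversion amounts to replacing the generated SMP-call wrapper with a plain implementation of the hook (a sketch; the arch-specific body is elided):

	/* before: the macro generated init_cache_level() as an
	 * smp_call_function_single() wrapper around __init_cache_level() */
	DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)

	/* after: the hook is implemented directly; the hotplug state
	 * machine already invokes it on the upcoming CPU with interrupts
	 * enabled, so it may sleep or take semaphores (e.g. for ACPI) */
	int init_cache_level(unsigned int cpu)
	{
		/* ... former __init_cache_level() body ... */
		return 0;
	}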
Fixes: 8571890e1513 ("arm64: Add support for ACPI based firmware tables") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Acked-by: Will Deacon Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/871r69ersb.ffs@tglx --- include/linux/cacheinfo.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 4f72b47973c3..2f909ed084c6 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -79,24 +79,6 @@ struct cpu_cacheinfo { bool cpu_map_populated; }; -/* - * Helpers to make sure "func" is executed on the cpu whose cache - * attributes are being detected - */ -#define DEFINE_SMP_CALL_CACHE_FUNCTION(func) \ -static inline void _##func(void *ret) \ -{ \ - int cpu = smp_processor_id(); \ - *(int *)ret = __##func(cpu); \ } \ - \ -int func(unsigned int cpu) \ -{ \ - int ret; \ - smp_call_function_single(cpu, _##func, &ret, true); \ - return ret; \ -} - struct cpu_cacheinfo *get_cpu_cacheinfo(unsigned int cpu); int init_cache_level(unsigned int cpu); int populate_cache_leaves(unsigned int cpu); -- cgit v1.2.3 From ef6c8da71eaffe4e251b0ff2a1d0da96f89fe6b0 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Wed, 1 Sep 2021 15:25:50 +0530 Subject: octeontx2-pf: cn10K: Reserve LMTST lines per core This patch reserves LMTST lines per CPU instead of separate LMTST lines for NPA (buffer free) and NIX (SQE flush). The LMTST line of the core on which an SQ or RQ is processed is used for the LMTST operation. This patch also replaces STEOR with STEORL, which has release semantics, and updates the driver name in the ethtool file. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- include/linux/soc/marvell/octeontx2/asm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index 28c04d918f0f..fa1d6af0164e 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -22,12 +22,17 @@ : [rs]"r" (ioaddr)); \ (result); \ }) +/* + * STEORL store to memory with release semantics. + * This will avoid using DMB barrier after each LMTST + * operation. + */ #define cn10k_lmt_flush(val, addr) \ ({ \ __asm__ volatile(".cpu generic+lse\n" \ - "steor %x[rf],[%[rs]]" \ - : [rf]"+r"(val) \ - : [rs]"r"(addr)); \ + "steorl %x[rf],[%[rs]]" \ + : [rf] "+r"(val) \ + : [rs] "r"(addr)); \ }) #else #define otx2_lmt_flush(ioaddr) ({ 0; }) -- cgit v1.2.3 From 59dc33252ee777e02332774fbdf3381b1d5d5f5d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 24 Aug 2021 16:43:55 +0200 Subject: PCI: VMD: ACPI: Make ACPI companion lookup work for VMD bus On some systems, in order to get to the deepest low-power state of the platform (which may be necessary to save significant enough amounts of energy while suspended to idle, for example), devices on the PCI bus exposed by the VMD driver need to be power-managed via ACPI. However, the layout of the ACPI namespace below the VMD controller device object does not reflect the layout of the PCI bus under the VMD host bridge, so in order to identify the ACPI companion objects for the devices on that bus, it is necessary to use a special _ADR encoding on the ACPI side. In other words, acpi_pci_find_companion() does not work for these devices, so it needs to be amended with special lookup logic specific to the VMD bus.
Address this issue by allowing the VMD driver to temporarily install an ACPI companion lookup hook containing the code matching the devices on the VMD PCI bus with the corresponding objects in the ACPI namespace. Signed-off-by: Rafael J. Wysocki Acked-by: Jon Derrick --- include/linux/pci-acpi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 5ba475ca9078..f16de399d2de 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -122,6 +122,9 @@ static inline void pci_acpi_add_edr_notifier(struct pci_dev *pdev) { } static inline void pci_acpi_remove_edr_notifier(struct pci_dev *pdev) { } #endif /* CONFIG_PCIE_EDR */ +int pci_acpi_set_companion_lookup_hook(struct acpi_device *(*func)(struct pci_dev *)); +void pci_acpi_clear_companion_lookup_hook(void); + #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } static inline void acpi_pci_remove_bus(struct pci_bus *bus) { } -- cgit v1.2.3 From 4bf8e582119ed9767f907abb6dc62ef9dddf10df Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 1 Sep 2021 14:41:57 +0530 Subject: cpufreq: Remove ready() callback This isn't used anymore, get rid of it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..fe6acc04e5e5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -367,9 +367,6 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); - /* Will be called after the driver is fully initialized */ - void (*ready)(struct cpufreq_policy *policy); - struct freq_attr **attr; /* platform specific boost support code */ -- cgit v1.2.3 From 8083f58d08fd52f547c0a62c0f4e448e15e6726b Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 7 Jul 2021 18:28:35 +0200 Subject: pwm: Make pwmchip_remove() return void For some time now, pwmchip_remove() has always returned 0, so the return value isn't useful. Now that all callers are converted to ignore its value the function can be changed to return void. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a0b7e43049d5..725c9b784e60 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -404,7 +404,7 @@ int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); int pwmchip_add(struct pwm_chip *chip); -int pwmchip_remove(struct pwm_chip *chip); +void pwmchip_remove(struct pwm_chip *chip); int devm_pwmchip_add(struct device *dev, struct pwm_chip *chip); -- cgit v1.2.3 From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT Commit 730633f0b7f95 became the first direct caller of __init_rwsem(), as opposed to the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it.
[ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- include/linux/rwsem.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 426e98e0b675..352c6127cb90 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -142,22 +142,14 @@ struct rw_semaphore { #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, +extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); -#else -static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -} -#endif #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - init_rwbase_rt(&(sem)->rwbase); \ - __rwsem_init((sem), #sem, &__key); \ + __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) -- cgit v1.2.3 From d095559ce4100f0c02aea229705230deac329c97 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 5 Jul 2021 09:22:56 +0800 Subject: ceph: flush mdlog before umounting Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index e41a811026f6..bc2699feddbe 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -299,6 +299,7 @@ enum { CEPH_SESSION_FLUSHMSG_ACK, CEPH_SESSION_FORCE_RO, CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3 From 7a8526a5cd51cf5f070310c6c37dd7293334ac49 Mon Sep 17 00:00:00 2001 From: Kate Hsuan Date: Fri, 3 Sep 2021 17:44:11 +0800 Subject: libata: Add ATA_HORKAGE_NO_NCQ_ON_ATI for Samsung 860 and 870 SSD. Many users report that the Samsung 860 and 870 SSDs have various issues when combined with AMD/ATI (vendor ID 0x1002) SATA controllers, and that only completely disabling NCQ helps to avoid these issues. Always disabling NCQ for Samsung 860/870 SSDs regardless of the host SATA adapter vendor will cause I/O performance degradation with well-behaved adapters. To limit the performance impact to ATI adapters, introduce the ATA_HORKAGE_NO_NCQ_ON_ATI flag to force disable NCQ only for these adapters. Also, two libata.force parameters (noncqati and ncqati) are introduced to disable and enable NCQ on systems equipped with an ATI SATA adapter and Samsung 860 or 870 SSDs. This lets the user enable or disable NCQ on demand. After verifying the chipset from the user reports, the issue appears on AMD/ATI SB7x0/SB8x0/SB9x0 SATA Controllers and does not appear on recent AMD SATA adapters. The vendor ID of ATI should be 0x1002. Therefore, ATA_HORKAGE_NO_NCQ_ON_AMD was modified to ATA_HORKAGE_NO_NCQ_ON_ATI. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=201693 Signed-off-by: Kate Hsuan Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20210903094411.58749-1-hpa@redhat.com Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 860e63f5667b..c0c64f03e107 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -426,6 +426,7 @@ enum { ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ + ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..48c6fa9ab792 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2581,6 +2581,7 @@ extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -- cgit v1.2.3 From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file We want to remove VM_DENYWRITE, which is currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keep write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W.
Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48c6fa9ab792..56b1cd41db61 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2580,7 +2580,7 @@ static inline int check_data_rlimit(unsigned long rlim, extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); -- cgit v1.2.3 From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mm.h | 1 - include/linux/mman.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56b1cd41db61..257995f62e83 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,7 +281,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/linux/mman.h b/include/linux/mman.h index ebb09a964272..bd9aadda047b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -153,7 +153,6 @@ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | - _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | arch_calc_vm_flag_bits(flags); -- cgit v1.2.3 From 6128b3af2a5e42386aa7faf37609b57f39fb7d00 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:38:14 +0200 Subject: mm: ignore MAP_DENYWRITE in ksys_mmap_pgoff() Let's also remove masking off MAP_DENYWRITE from ksys_mmap_pgoff(): the last in-tree occurrence of MAP_DENYWRITE is now in LEGACY_MAP_MASK, which accepts the flag e.g., for MAP_SHARED_VALIDATE; however, the flag is ignored throughout the kernel now. Add a comment to LEGACY_MAP_MASK stating that MAP_DENYWRITE is ignored. Acked-by: "Eric W.
Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/mman.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index bd9aadda047b..b66e91b8176c 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -32,7 +32,8 @@ * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * - * MAP_EXECUTABLE is completely ignored throughout the kernel. + * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the + * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ -- cgit v1.2.3 From 592ca09be8333bd226f50100328a905bfc377133 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 09:45:45 +0200 Subject: fs: update documentation of get_write_access() and friends As VM_DENYWRITE no longer exists, let's spring-clean the documentation of get_write_access() and friends. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- include/linux/fs.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..e0dc3e96ed72 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3055,15 +3055,20 @@ static inline void file_end_write(struct file *file) } /* + * This is used for regular files where some users -- especially the + * currently executed binary in a process, previously handled via + * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap + * read-write shared) accesses. + * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. - * This is used for regular files. - * We cannot support write (and maybe mmap read-write shared) accesses and - * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode - * can have the following values: - * 0: no writers, no VM_DENYWRITE mappings - * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist - * > 0: (i_writecount) users are writing to the file. + * deny_write_access() denies write access to a file. + * allow_write_access() re-enables write access to a file. + * + * The i_writecount field of an inode can have the following values: + * 0: no write access, no denied write access + * < 0: (-i_writecount) users that denied write access to the file. + * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet.
Then we need to -- cgit v1.2.3 From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3..8894825cc4db 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif -- cgit v1.2.3 From 633a2abb9e1cd5c95f3b600f4b2c12cce22ae4a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:04 -0700 Subject: writeback: track number of inodes under writeback Patch series "writeback: Fix bandwidth estimates", v4. Fix estimate of writeback throughput when device is not fully busy doing writeback. Michael Stapelberg has reported that such workload (e.g. generated by linking) tends to push estimated throughput down to 0 and as a result writeback on the device is practically stalled. The first three patches fix the reported issue, the remaining two patches are unrelated cleanups of problems I've noticed when reading the code. This patch (of 4): Track number of inodes under writeback for each bdi_writeback structure. We will use this to decide whether wb does any IO and so we can estimate its writeback throughput. In principle we could use number of pages under writeback (WB_WRITEBACK counter) for this however normal percpu counter reads are too inaccurate for our purposes and summing the counter is too expensive. 
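For illustration, the new counter would be maintained from the points where an inode's writeback state changes; a minimal sketch (the helper names follow the description above and are illustrative, not necessarily the final API):

  static void wb_inode_writeback_start(struct bdi_writeback *wb)
  {
  	atomic_inc(&wb->writeback_inodes);
  }

  static void wb_inode_writeback_end(struct bdi_writeback *wb)
  {
  	atomic_dec(&wb->writeback_inodes);
  }

The start helper would run when the first page of an inode is marked under writeback, and the end helper when its last page completes, so a non-zero counter means the wb is currently doing IO.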
Link: https://lkml.kernel.org/r/20210713104519.16394-1-jack@suse.cz Link: https://lkml.kernel.org/r/20210713104716.22868-1-jack@suse.cz Signed-off-by: Jan Kara Cc: Wu Fengguang Cc: Michael Stapelberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1d7edad9914f..06fb8e13f6bc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,7 @@ struct bdi_writeback { struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long congested; /* WB_[a]sync_congested flags */ -- cgit v1.2.3 From fee468fdf41cdf36ba6b5a780e2474d0a3e066ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:06 -0700 Subject: writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 19 +++++++++++++++++++ include/linux/writeback.h | 1 - 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..8a886bca51e5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) return inode->i_wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? 
wbc->wb : &inode_to_bdi(inode)->wb; +} + /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode @@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + return inode_to_wb(inode); +} + + static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..2480322c06a7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From 45a2966fd64147518dc5bca25f447bd0fb5359ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:09 -0700 Subject: writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. 
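One way to get that extra update, sketched under the assumption that the writeback completion path can see the bandwidth timestamp: when writeback of an inode finishes and the estimate is older than the usual 200ms interval, kick a delayed work that recomputes it once the in-flight pages have drained:

  if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
  	queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);

That way a single short spike still produces one estimate update based on everything it wrote, instead of the next spike discarding it as idle time.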
[jack@suse.cz: avoid division by 0 in wb_update_dirty_ratelimit()] Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc..33207004cfde 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a7..cbaef099645e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. 
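A sketch of the simplification (the exact stat item read is an assumption here; the point is reading the counter directly instead of calling mem_cgroup_wb_stats()):

  /* best-effort: just the dirty page count, no bandwidth/headroom math */
  nr_pages = memcg_page_state(memcg, NR_FILE_DIRTY);

memcg_page_state() is made available to this caller by the header change below.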
Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1..3403ec77528a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645e..aeda2c0c9986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); -- cgit v1.2.3 From 6de522d1667f628376517e4b177af10c739d745b Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Thu, 2 Sep 2021 14:53:30 -0700 Subject: include/linux/buffer_head.h: fix boolreturn.cocci warnings ./include/linux/buffer_head.h:412:64-65:WARNING:return of 0/1 in function 'has_bh_in_lru' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. 
Generated by: scripts/coccinelle/misc/boolreturn.cocci Link: https://lkml.kernel.org/r/20210824055828.58783-1-deng.changcheng@zte.com.cn Signed-off-by: Jing Yangyang Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e7e99da31349..6486d3c19463 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus_cpu(int cpu) {} -static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } +static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:51 -0700 Subject: mm/gup: small refactoring: simplify try_grab_page() try_grab_page() does the same thing as try_grab_compound_head(..., refs=1, ...), just with a different API. So there is a lot of code duplication there. Change try_grab_page() to call try_grab_compound_head(), while keeping the API contract identical for callers. Also, now that try_grab_compound_head() always has a caller, remove the __maybe_unused annotation. Link: https://lkml.kernel.org/r/20210813044133.1536842-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..af4845e81b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1214,8 +1214,8 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); +struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); static inline __must_check bool try_get_page(struct page *page) -- cgit v1.2.3 From 9857a17f206ff374aea78bccfb687f145368be2e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:54 -0700 Subject: mm/gup: remove try_get_page(), call try_get_compound_head() directly try_get_page() is very similar to try_get_compound_head(), and in fact try_get_page() has fallen a little behind in terms of maintenance: try_get_compound_head() handles speculative page references more thoroughly. There are only two try_get_page() callsites, so just call try_get_compound_head() directly from those, and remove try_get_page() entirely. Also, seeing as how this changes try_get_compound_head() into a non-static function, provide some kerneldoc documentation for it. 
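A caller conversion then looks roughly like this (sketch):

  /* before: bool-returning helper */
  if (unlikely(!try_get_page(page)))
  	return NULL;

  /* after: refs == 1 covers the former try_get_page() case */
  if (unlikely(!try_get_compound_head(page, 1)))
  	return NULL;

The difference callers must absorb is the return type: try_get_compound_head() hands back the head page (or NULL on failure) rather than a bool.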
Link: https://lkml.kernel.org/r/20210813044133.1536842-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index af4845e81b84..d3439dd4f4ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1217,15 +1217,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); - -static inline __must_check bool try_get_page(struct page *page) -{ - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; -} +struct page *try_get_compound_head(struct page *page, int refs); static inline void put_page(struct page *page) { -- cgit v1.2.3 From 3969b1a654fb09b7915efc1aa4ad45daf932e12f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:54:00 -0700 Subject: mm: delete unused get_kernel_page() get_kernel_page() was added in 2012 by [1]. It was used for a while for NFS, but then in 2014, a refactoring [2] removed all callers, and it has apparently not been used since. Remove get_kernel_page() because it has no callers. [1] commit 18022c5d8627 ("mm: add get_kernel_page[s] for pinning of kernel addresses for I/O") [2] commit 91f79c43d1b5 ("new helper: iov_iter_get_pages_alloc()") Link: https://lkml.kernel.org/r/20210729221847.1165665-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Rik van Riel Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3439dd4f4ba..35bbac32b6f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1839,7 +1839,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); -- cgit v1.2.3 From bf11b9a8e9a93c1fc0ebfc2929622d5cf7d43888 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 2 Sep 2021 14:54:03 -0700 Subject: shmem: use raw_spinlock_t for ->stat_lock Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is per-CPU. Access here is serialized by disabling preemption. If the pool is empty, it gets reloaded from `->next_ino'. Access here is serialized by ->stat_lock which is a spinlock_t and can not be acquired with disabled preemption. One way around it would make per-CPU ino_batch struct containing the inode number a local_lock_t. Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical sections are short. The mpol_put() must be moved outside of the critical section to avoid invoking the destructor with disabled preemption. 
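The resulting shape of the update side is roughly (a sketch of the remount path, assuming the shmem field names):

  struct mempolicy *mpol = NULL;

  raw_spin_lock(&sbinfo->stat_lock);
  /* ... update sbinfo under the raw lock ... */
  mpol = sbinfo->mpol;
  sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
  ctx->mpol = NULL;
  raw_spin_unlock(&sbinfo->stat_lock);

  mpol_put(mpol);			/* destructor runs preemptible */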
Link: https://lkml.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 8e775ce517bb..0a8499fb9c3c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ -- cgit v1.2.3 From d144bf6205342a4b5fed5d204ae18849a4de741b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:21 -0700 Subject: huge tmpfs: fix split_huge_page() after FALLOC_FL_KEEP_SIZE A successful shmem_fallocate() guarantees that the extent has been reserved, even beyond i_size when the FALLOC_FL_KEEP_SIZE flag was used. But that guarantee is broken by shmem_unused_huge_shrink()'s attempts to split huge pages and free their excess beyond i_size; and by other uses of split_huge_page() near i_size. It's sad to add a shmem inode field just for this, but I did not find a better way to keep the guarantee. A flag to say KEEP_SIZE has been used would be cheaper, but I'm averse to unclearable flags. The fallocend field is not perfect either (many disjoint ranges might be fallocated), but good enough; and gains another use later on. Link: https://lkml.kernel.org/r/ca9a146-3a59-6cd3-7f28-e9a044bb1052@google.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0a8499fb9c3c..bfc5899d18e0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -18,6 +18,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ + pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ @@ -119,6 +120,18 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* + * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages + * beyond i_size's notion of EOF, which fallocate has committed to reserving: + * which split_huge_page() must therefore not delete. This use of a single + * "fallocend" per inode errs on the side of not deleting a reservation when + * in doubt: there are plenty of cases when it preserves unreserved pages. 
+ */ +static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) +{ + return max(eof, SHMEM_I(inode)->fallocend); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -- cgit v1.2.3 From acdd9f8e0fed9f1bd7e83a8ff934694bb4c9a72b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:34 -0700 Subject: huge tmpfs: SGP_NOALLOC to stop collapse_file() on race khugepaged's collapse_file() currently uses SGP_NOHUGE to tell shmem_getpage() not to try allocating a huge page, in the very unlikely event that a racing hole-punch removes the swapped or fallocated page as soon as i_pages lock is dropped. We want to consolidate shmem's huge decisions, removing SGP_HUGE and SGP_NOHUGE; but cannot quite persuade ourselves that it's okay to regress the protection in this case - Yang Shi points out that the huge page would remain indefinitely, charged to root instead of the intended memcg. collapse_file() should not even allocate a small page in this case: why proceed if someone is punching a hole? SGP_READ is almost the right flag here, except that it optimizes away from a fallocated page, with NULL to tell caller to fill with zeroes (like a hole); whereas collapse_file()'s sequence relies on using a cache page. Add SGP_NOALLOC just for this. There are too many consecutive "if (page"s there in shmem_getpage_gfp(): group it better; and fix the outdated "bring it back from swap" comment. Link: https://lkml.kernel.org/r/1355343b-acf-4653-ef79-6aee40214ac5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bfc5899d18e0..a3f4502ec8a9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,6 +94,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ -- cgit v1.2.3 From 5e6e5a12a44ca5ff2b130d8d39aaf9b8c026de94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:37 -0700 Subject: huge tmpfs: shmem_is_huge(vma, inode, index) Extend shmem_huge_enabled(vma) to shmem_is_huge(vma, inode, index), so that a consistent set of checks can be applied, even when the inode is accessed through read/write syscalls (with NULL vma) instead of mmaps (the index argument is seldom of interest, but required by mount option "huge=within_size"). Clean up and rearrange the checks a little. This then replaces the checks which shmem_fault() and shmem_getpage_gfp() were making, and eliminates the SGP_HUGE and SGP_NOHUGE modes. Replace a couple of 0s by explicit SHMEM_HUGE_NEVERs; and replace the obscure !shmem_mapping() symlink check by explicit S_ISLNK() - nothing else needs that symlink check, so leave it there in shmem_getpage_gfp(). 
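Sketch of the consolidated helper (simplified from the description above; the real implementation also handles fallthrough between the cases):

  bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
  		   pgoff_t index)
  {
  	if (shmem_huge == SHMEM_HUGE_DENY)
  		return false;
  	if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
  		    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
  		return false;
  	if (shmem_huge == SHMEM_HUGE_FORCE)
  		return true;

  	switch (SHMEM_SB(inode->i_sb)->huge) {
  	case SHMEM_HUGE_ALWAYS:
  		return true;
  	case SHMEM_HUGE_WITHIN_SIZE:
  		index = round_up(index + 1, HPAGE_PMD_NR);
  		return i_size_read(inode) >= (loff_t)index << PAGE_SHIFT;
  	case SHMEM_HUGE_ADVISE:
  		return vma && (vma->vm_flags & VM_HUGEPAGE);
  	default:
  		return false;
  	}
  }

With a NULL vma (read/write syscalls) only the mount option and forced settings decide, which is exactly the consistency the patch is after.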
Link: https://lkml.kernel.org/r/23a77889-2ddc-b030-75cd-44ca27fd4d1@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a3f4502ec8a9..166158b6e917 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -86,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -96,8 +101,6 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -- cgit v1.2.3 From 2c8d8f97ae2272f1455ee31a5af62b326772eb31 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:50 -0700 Subject: mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline the mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions to perform the mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using the cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running the PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. 
[1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3403ec77528a..a00bf337567b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -693,13 +693,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_charge(page, mm, gfp_mask); +} + int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void mem_cgroup_uncharge(struct page *page); -void mem_cgroup_uncharge_list(struct list_head *page_list); +void __mem_cgroup_uncharge(struct page *page); +static inline void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge(page); +} + +void __mem_cgroup_uncharge_list(struct list_head *page_list); +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_list(page_list); +} void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); -- cgit v1.2.3 From 01c4b28cd2e600160566a7d83b4703800381dae1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:54 -0700 Subject: mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. 
[1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6f5a43251593..f30d26b0f71d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return; + __cgroup_throttle_swaprate(page, gfp_mask); +} #else static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { @@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swap(page, entry); +} + +extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_swap(entry, nr_pages); +} + extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else -- cgit v1.2.3 From 7e1c0d6f58207e7e60674647d3935f446f05613c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:00 -0700 Subject: memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. 
The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a00bf337567b..47d35bef9f6e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -struct lruvec_stat { - long count[NR_VM_NODE_STAT_ITEMS]; -}; - -struct batched_lruvec_stat { - s32 count[NR_VM_NODE_STAT_ITEMS]; -}; - /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to this memcg. @@ -123,24 +115,30 @@ struct shrinker_info { unsigned long *map; }; +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_VM_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; - /* - * Legacy local VM stats. This should be struct lruvec_stat and - * cannot be optimized to struct batched_lruvec_stat. Because - * the threshold of the lruvec_stat_cpu can be as big as - * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this - * filed has no upper limit. 
- * filed has no upper limit. - */ - struct lruvec_stat __percpu *lruvec_stat_local; - - /* Subtree VM stats (batched updates) */ - struct batched_lruvec_stat __percpu *lruvec_stat_cpu; - atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; + struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); + x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; -- cgit v1.2.3 From aa48e47e3906c332eaf1e5d7b58be11d3509ad9f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:04 -0700 Subject: memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for the first two cases. Flushing the stats for the remaining two cases may have a performance impact. Always flushing the memcg stats on the page fault code path may negatively impact the performance of the applications. In addition, flushing in the memory reclaim code path, though treated as slowpath, can become a source of contention for the global lock taken for stat flushing, because when the system or a memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses the following mechanisms to solve these challenges: 1. Periodically flush the stats from the root memcg every 2 seconds. This will time-limit the out-of-sync stats. 2. Asynchronously flush the stats after a fixed number of stat updates. In the worst case the stats can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. To avoid a thundering herd flushing the stats, particularly from the memory reclaim context, introduce a memcg-local spinlock and let only one flusher be active at a time. This could have been done through the cgroup_rstat_lock lock, but that lock is used by other subsystems and for userspace reading of memcg stats. So it is better to keep the flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use the irqsafe version of rstat flush, but that is fine as this code path will be flushing the whole tree and doing the work for everyone. No one will be waiting for that worker. 
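A sketch of these three mechanisms together (simplified; the periodic interval and the trylock-based exclusion follow the description above):

  static void flush_memcg_stats_dwork(struct work_struct *w);
  static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
  static DEFINE_SPINLOCK(stats_flush_lock);

  void mem_cgroup_flush_stats(void)
  {
  	if (!spin_trylock(&stats_flush_lock))
  		return;	/* another flusher is already doing the work */

  	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
  	spin_unlock(&stats_flush_lock);
  }

  static void flush_memcg_stats_dwork(struct work_struct *w)
  {
  	mem_cgroup_flush_stats();
  	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL * HZ);
  }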
[shakeelb@google.com: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Tested-by: Marek Szyprowski Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47d35bef9f6e..c4c5b652d3af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +void mem_cgroup_flush_stats(void); + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); @@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline void mem_cgroup_flush_stats(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { -- cgit v1.2.3 From 96e51ccf1af33e82f429a0d6baebba29c6448d0f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:46 -0700 Subject: memcg: cleanup racy sum avoidance code We used to have per-cpu memcg and lruvec stats and the readers have to traverse and sum the stats from each cpu. This summing was racy and may expose transient negative values. So, an explicit check was added to avoid such scenarios. Now these stats are moved to rstat infrastructure and are no more per-cpu, so we can remove the fixup for transient negative values. 
Link: https://lkml.kernel.org/r/20210728012243.3369123-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4c5b652d3af..2ee20509ac71 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -977,30 +977,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(memcg->vmstats.state[idx]); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = READ_ONCE(pn->lruvec_stats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(pn->lruvec_stats.state[idx]); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From 55a68c823951855f3ec313fdb85100db84284505 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:49 -0700 Subject: memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses an in_interrupt() check to select the proper storage for the cgroup: a pointer on the task struct or a per-cpu pointer. This isn't fully correct: the obsolete in_interrupt() check also includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8894825cc4db..5561486fddef 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { -- cgit v1.2.3 From bec49c067c679e9b7ca7c1aac50b56618c12d879 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:56 -0700 Subject: mm, memcg: remove unused functions Since commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat"), the last user of memcg_stat_item_in_bytes() is gone. And since commit fa40d1ee9f15 ("mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()"), only the declaration of mem_cgroup_select_victim_node() remains here. Remove them. 
Link: https://lkml.kernel.org/r/20210807082835.61281-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Alex Shi Cc: Matthew Wilcox (Oracle) Cc: Vladimir Davydov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ee20509ac71..69e6c5462c43 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -593,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif -static __always_inline bool memcg_stat_item_in_bytes(int idx) -{ - if (idx == MEMCG_PERCPU_B) - return true; - return vmstat_item_in_bytes(idx); -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -904,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return !!(memcg->css.flags & CSS_ONLINE); } -/* - * For memory reclaim. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); - void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -- cgit v1.2.3 From f358afc52c3066f4e8cd7b3a2d75b31e822519e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:36 -0700 Subject: mm: remove flush_kernel_dcache_page flush_kernel_dcache_page is a rather confusing interface that implements a subset of flush_dcache_page by not being able to properly handle page cache mapped pages. The only callers left are in the exec code, as all other previous callers were incorrect since they could have dealt with page cache pages. Replace the calls to flush_kernel_dcache_page with calls to flush_dcache_page, which for all architectures either does exactly the same thing or contains one or more of the following: 1) an optimization to defer the cache flush for page cache pages not mapped into userspace 2) additional flushing for mapped page cache pages if cache aliases are possible Link: https://lkml.kernel.org/r/20210712060928.4161649-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Reviewed-by: Ira Weiny Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d9a606a9fc64..b4c49f9cc379 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page } #endif -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} +#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } -- cgit v1.2.3 From e15710bf04063766f428048f4dad7b73b646203f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Sep 2021 14:56:43 -0700 Subject: mm: change fault_in_pages_* to have an unsigned size parameter fault_in_pages_writeable() and fault_in_pages_readable() treat the size parameter as unsigned, doing pointer math with the value, so make this explicit and set it to be a size_t type which all callers currently treat it as anyway. This solves the issue where static checkers get nervous seeing pointer arithmetic happening with a signed value. Link: https://lkml.kernel.org/r/20210727111136.457638-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reported-by: Jordy Zomer Cc: Matthew Wilcox Cc: David Howells Cc: William Kucharski Cc: "Darrick J. Wong" Cc: Hugh Dickins Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..5dcf446f42e5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ -static inline int fault_in_pages_writeable(char __user *uaddr, int size) +static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) { char __user *end = uaddr + size - 1; @@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return 0; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) { volatile char c; const char __user *end = uaddr + size - 1; -- cgit v1.2.3 From fc1f5e980a463325cf41d39ac6a69aa3cca73995 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:01 -0700 Subject: mm: sparse: pass section_nr to find_memory_block With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. On the other hand, __nr_to_section() which converts section_nr to mem_section can be done in O(1). Let's pass section_nr instead of mem_section ptr to find_memory_block() in order to reduce needless iterations. 
Link: https://lkml.kernel.org/r/20210707150212.855-3-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d9a0b61cd432 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block(struct mem_section *); +extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); -- cgit v1.2.3 From 11e02d3729da1a2d4a33db5ea61291770d411884 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:04 -0700 Subject: mm: sparse: remove __section_nr() function As the last users of __section_nr() are gone, let's remove unused function __section_nr(). Link: https://lkml.kernel.org/r/20210707150212.855-4-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..8827f4d081d4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* -- cgit v1.2.3 From 01c8d337d195ed105cabab95bc4dcb9e145bf5ea Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 2 Sep 2021 14:57:07 -0700 Subject: mm/sparse: set SECTION_NID_SHIFT to 6 Currently SECTION_NID_SHIFT is set to 3, which is incorrect because bit 3 and 4 can be overlapped by sub-field for early NID, and can be unexpectedly set on NUMA systems. There are a few non-critical issues related to this: - Having SECTION_TAINT_ZONE_DEVICE set for wrong sections forces pfn_to_online_page() through the slow path, but doesn't actually break the kernel. - A kdump generation tool like makedumpfile uses this field to calculate the physical address to read. So wrong bits can make the tool access to wrong address and fail to create kdump. This can be avoided by the tool, so it's not critical. To fix it, set SECTION_NID_SHIFT to 6 which is the minimum number of available bits of section flag field. 
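The NID is stored in the upper bits of section_mem_map by the early-boot encode/decode pair, roughly as in mm/sparse.c:

  static inline unsigned long sparse_encode_early_nid(int nid)
  {
  	return ((unsigned long)nid << SECTION_NID_SHIFT);
  }

  static inline int sparse_early_nid(struct mem_section *section)
  {
  	return (section->section_mem_map >> SECTION_NID_SHIFT);
  }

With a shift of 3, nid bit 0 landed on SECTION_IS_EARLY (bit 3) and nid bit 1 on SECTION_TAINT_ZONE_DEVICE (bit 4); shifting by 6 keeps the NID clear of every flag bit shown in the diff below.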
Link: https://lkml.kernel.org/r/20210707045548.810271-1-naoya.horiguchi@linux.dev Fixes: 1f90a3477df3 ("mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions") Signed-off-by: Naoya Horiguchi Reported-by: Kazuhito Hagio Suggested-by: Dan Williams Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wang Wensheng Cc: Rui Xiang Cc: Kazu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8827f4d081d4..59bad25ce78e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1364,7 +1364,7 @@ extern size_t mem_section_usage_size(void); #define SECTION_TAINT_ZONE_DEVICE (1UL<<4) #define SECTION_MAP_LAST_BIT (1UL<<5) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_NID_SHIFT 6 static inline struct page *__section_mem_map_addr(struct mem_section *section) { -- cgit v1.2.3 From d0505e9f7dcec85da6634ec66da2b17656ee177b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:31 -0700 Subject: mm: hwpoison: don't drop slab caches for offlining non-LRU page In the current implementation of soft offline, if a non-LRU page is met, all the slab caches will be dropped to free the page and then offline it. But if the page is not a slab page, all that effort is in vain. Even if it is a slab page, it is not guaranteed the page could be freed at all. However, the side effect and cost are quite high. It does not only drop the slab caches, but may also drop a significant amount of page caches which are associated with inode caches. It could make most of the workingset gone in order to just offline a page. And the offline is not guaranteed to succeed at all; actually I really doubt the success rate for real-life workloads. Furthermore, the worst consequence is that the system may be locked up and unusable, since the page cache release may incur a huge amount of work queued for memcg release. Actually we ran into such an unpleasant case in our production environment. Firstly, the workqueue of memory_failure_work_func is locked up as below: BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 53s! 
Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=14/256 refcnt=15 in-flight: 409271:memory_failure_work_func pending: kfree_rcu_work, kfree_rcu_monitor, kfree_rcu_work, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, drain_local_stock, kfree_rcu_work workqueue mm_percpu_wq: flags=0x8 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update workqueue cgroup_destroy: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 refcnt=12072 pending: css_release_work_fn There were over 12K css_release_work_fn queued, and this caused a few lockups due to the contention of worker pool lock with IRQ disabled, for example: NMI watchdog: Watchdog detected hard LOCKUP on cpu 1 Modules linked in: amd64_edac_mod edac_mce_amd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel xt_DSCP iptable_mangle kvm_amd bpfilter vfat fat acpi_ipmi i2c_piix4 usb_storage ipmi_si k10temp i2c_core ipmi_devintf ipmi_msghandler acpi_cpufreq sch_fq_codel xfs libcrc32c crc32c_intel mlx5_core mlxfw nvme xhci_pci ptp nvme_core pps_core xhci_hcd CPU: 1 PID: 205500 Comm: kworker/1:0 Tainted: G L 5.10.32-t1.el7.twitter.x86_64 #1 Hardware name: TYAN F5AMT /z /S8026GM2NRE-CGN, BIOS V8.030 03/30/2021 Workqueue: events memory_failure_work_func RIP: 0010:queued_spin_lock_slowpath+0x41/0x1a0 Code: 41 f0 0f ba 2f 08 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09 d0 a9 00 01 ff ff 75 1b 85 c0 74 0e 8b 07 84 c0 74 08 f3 90 <8b> 07 84 c0 75 f8 b8 01 00 00 00 66 89 07 c3 f6 c4 01 75 04 c6 47 RSP: 0018:ffff9b2ac278f900 EFLAGS: 00000002 RAX: 0000000000480101 RBX: ffff8ce98ce71800 RCX: 0000000000000084 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8ce98ce6a140 RBP: 00000000000284c8 R08: ffffd7248dcb6808 R09: 0000000000000000 R10: 0000000000000003 R11: ffff9b2ac278f9b0 R12: 0000000000000001 R13: ffff8cb44dab9c00 R14: ffffffffbd1ce6a0 R15: ffff8cacaa37f068 FS: 0000000000000000(0000) GS:ffff8ce98ce40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcf6e8cb000 CR3: 0000000a0c60a000 CR4: 0000000000350ee0 Call Trace: __queue_work+0xd6/0x3c0 queue_work_on+0x1c/0x30 uncharge_batch+0x10e/0x110 mem_cgroup_uncharge_list+0x6d/0x80 release_pages+0x37f/0x3f0 __pagevec_release+0x1c/0x50 __invalidate_mapping_pages+0x348/0x380 inode_lru_isolate+0x10a/0x160 __list_lru_walk_one+0x7b/0x170 list_lru_walk_one+0x4a/0x60 prune_icache_sb+0x37/0x50 super_cache_scan+0x123/0x1a0 do_shrink_slab+0x10c/0x2c0 shrink_slab+0x1f1/0x290 drop_slab_node+0x4d/0x70 soft_offline_page+0x1ac/0x5b0 memory_failure_work_func+0x6a/0x90 process_one_work+0x19e/0x340 worker_thread+0x30/0x360 kthread+0x116/0x130 The lockup made the machine is quite unusable. And it also made the most workingset gone, the reclaimabled slab caches were reduced from 12G to 300MB, the page caches were decreased from 17G to 4G. But the most disappointing thing is all the effort doesn't make the page offline, it just returns: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It seems the aggressive behavior for non-LRU page didn't pay back, so it doesn't make too much sense to keep it considering the terrible side effect. 
Link: https://lkml.kernel.org/r/20210819054116.266126-1-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: David Mackey Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35bbac32b6f6..11c38550627c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3110,7 +3110,7 @@ extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); -- cgit v1.2.3 From 09a26e832705fdb7a9484495b71a05e0bbc65207 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:53 -0700 Subject: hugetlb: fix hugetlb cgroup refcounting during vma split Guillaume Morin reported hitting the following WARNING followed by a GPF or NULL pointer dereference either in cgroups_destroy or in the kill_css path: percpu ref (css_release) <= 0 (-1) after switching to atomic WARNING: CPU: 23 PID: 130 at lib/percpu-refcount.c:196 percpu_ref_switch_to_atomic_rcu+0x127/0x130 CPU: 23 PID: 130 Comm: ksoftirqd/23 Kdump: loaded Tainted: G O 5.10.60 #1 RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x127/0x130 Call Trace: rcu_core+0x30f/0x530 rcu_core_si+0xe/0x10 __do_softirq+0x103/0x2a2 run_ksoftirqd+0x2b/0x40 smpboot_thread_fn+0x11a/0x170 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 Upon further examination, it was discovered that the css structure was associated with hugetlb reservations. For private hugetlb mappings the vma points to a reserve map that contains a pointer to the css. At mmap time, reservations are set up and a reference to the css is taken. This reference is dropped in the vma close operation, hugetlb_vm_op_close. However, if a vma is split, no additional reference to the css is taken, yet hugetlb_vm_op_close will be called twice for the split vma, resulting in an underflow. Fix by taking another reference in hugetlb_vm_op_open. Note that the reference is only taken for the owner of the reserve map. In the more common fork case, the pointer to the reserve map is cleared for non-owning vmas.
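A minimal sketch of the open-side fix that pairs with the helper added below; the real hunk lives in mm/hugetlb.c and carries more context:

static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
	struct resv_map *resv = vma_resv_map(vma);

	/*
	 * Sketch: only the owner of the reserve map holds the css
	 * reference, so take one more here to balance the second
	 * hugetlb_vm_op_close() that a vma split will produce.
	 */
	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		kref_get(&resv->refs);
		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
	}
}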
Link: https://lkml.kernel.org/r/20210830215015.155224-1-mike.kravetz@oracle.com Fixes: e9fe92ae0cd2 ("hugetlb_cgroup: add reservation accounting for private mappings") Signed-off-by: Mike Kravetz Reported-by: Guillaume Morin Suggested-by: Guillaume Morin Tested-by: Guillaume Morin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0b8d1fdda3a1..c137396129db 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) css_put(&h_cg->css); } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_get(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { -- cgit v1.2.3 From a759a909d42d727e918bd5248d6cff7562fa8109 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:56 -0700 Subject: userfaultfd: change mmap_changing to atomic Patch series "userfaultfd: minor bug fixes". Three unrelated bug fixes. The first two address possible issues (not too theoretical ones), but I did not encounter them in practice. The third patch addresses a test bug that causes the test to fail on my system. It has been sent before as part of a bigger RFC. This patch (of 3): mmap_changing is currently a boolean variable, which is set and cleared without any lock that protects against concurrent modifications. mmap_changing is supposed to mark whether userfaultfd page-fault handling should be retried since mappings are undergoing a change. However, concurrent calls, for instance to madvise(MADV_DONTNEED), might cause mmap_changing to be false, although the remove event was still not read (hence acknowledged) by the user. Change mmap_changing to atomic_t and increase/decrease it appropriately. Add a debug assertion to see whether mmap_changing is negative.
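A sketch of the resulting protocol; the ctx field name matches the diff below, while the surrounding call sites (fs/userfaultfd.c and mm/userfaultfd.c) are abbreviated here:

	/* writer side: a mapping change (e.g. an madvise remove) begins */
	atomic_inc(&ctx->mmap_changing);
	/* ... queue the remove event for userspace ... */

	/* once the event has been read (acknowledged) by the user */
	atomic_dec(&ctx->mmap_changing);

	/* reader side: refuse to service copies while changes are pending */
	if (atomic_read(mmap_changing))
		return -EAGAIN;

	/* the debug assertion mentioned above */
	VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);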
Link: https://lkml.kernel.org/r/20210808020724.1022515-1-namit@vmware.com Link: https://lkml.kernel.org/r/20210808020724.1022515-2-namit@vmware.com Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Nadav Amit Cc: Mike Rapoport Cc: Peter Xu Cc: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bcc..33cea484d1ad 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode); + atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, - bool *mmap_changing); + atomic_t *mmap_changing); extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, bool *mmap_changing); + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, - bool enable_wp, bool *mmap_changing); + bool enable_wp, atomic_t *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, -- cgit v1.2.3 From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. 
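For illustration, a sketch of a caller that wants the precise count against the new signature shown below; MR_DEMOTION, alloc_demote_page and the PGDEMOTE_* counter come from the follow-up patches in this series and are assumptions here, and callers that don't care can simply pass NULL:

	LIST_HEAD(demote_pages);
	unsigned int nr_succeeded = 0;
	int err;

	/* ... isolate candidate pages onto demote_pages ... */
	err = migrate_pages(&demote_pages, alloc_demote_page, NULL,
			    (unsigned long)pgdat, MIGRATE_ASYNC, MR_DEMOTION,
			    &nr_succeeded);
	if (err)
		putback_movable_pages(&demote_pages);

	/* nr_succeeded is valid even when err is non-zero */
	count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);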
Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) -- cgit v1.2.3 From 26aa2d199d6f2cfa6f2ef2a5dfe891f2250e71a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:16 -0700 Subject: mm/migrate: demote pages during reclaim This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. In some cases, for example MADV_PAGEOUT, the pages are always discarded instead of demoted, to follow the kernel API definition, because MADV_PAGEOUT is defined as freeing the specified pages regardless of which tier they are in. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in migrate_demote_page_ok().
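The gate itself is presumably shaped like the following sketch, built on the next_demotion_node() helper added in the next hunk; details differ upstream, and as noted the FIXME keeps it disabled for now:

static bool migrate_demote_page_ok(struct page *page)
{
	int next_nid = next_demotion_node(page_to_nid(page));

	/* no lower tier to demote to */
	if (next_nid == NUMA_NO_NODE)
		return false;
	if (PageTransHuge(page) && !thp_migration_supported())
		return false;

	/* FIXME: actually enable this later */
	return false;
}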
[dave.hansen@linux.intel.com: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8ab88d46318e..326250996b4e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -28,6 +28,7 @@ enum migrate_reason { MR_NUMA_MISPLACED, MR_CONTIG_RANGE, MR_LONGTERM_PIN, + MR_DEMOTION, MR_TYPES }; @@ -167,6 +168,14 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int next_demotion_node(int node); + +#else /* CONFIG_MIGRATION disabled: */ + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_MIGRATION */ -- cgit v1.2.3 From 668e4147d8850df32ca41e28f52c146025ca45c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:19 -0700 Subject: mm/vmscan: add page demotion counter Account the number of demoted pages. Add pgdemote_kswapd and pgdemote_direct VM counters shown in /proc/vmstat. [ daveh: - __count_vm_events() a bit, and made them look at the THP size directly rather than getting data from migrate_pages() ] Link: https://lkml.kernel.org/r/20210721063926.3024591-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-6-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Oscar Salvador Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ae0dd1948c2b..a185cc75ff52 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, -- cgit v1.2.3 From 20b51af15e014cac63b58a4f8b8b323ac35bccce Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Sep 2021 14:59:33 -0700 Subject: mm/migrate: add sysfs interface to enable reclaim migration Some method is obviously needed to enable reclaim-based migration. Just like traditional autonuma, there will be some workloads that will benefit, like workloads with more "static" configurations where hot pages stay hot and cold pages stay cold. If pages come and go from the hot and cold sets, the benefits of this approach will be more limited. The benefits are truly workload-based and *not* hardware-based. We do not believe that there is a viable threshold where certain hardware configurations should have this mechanism enabled while others do not.
To be conservative, earlier work defaulted to disabling reclaim-based migration and did not include a mechanism to enable it. This patch proposes adding a new sysfs file, /sys/kernel/mm/numa/demotion_enabled, as a method to enable it. We are open to any alternative that allows end users to enable this mechanism or disable it if workload harm is detected (just like traditional autonuma). Once this is enabled, page demotion may move data to a NUMA node that does not fall into the cpuset of the allocating process. This could be construed to violate the guarantees of cpusets. However, since this is an opt-in mechanism, the assumption is that anyone enabling it is content to relax the guarantees. Link: https://lkml.kernel.org/r/20210721063926.3024591-9-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-10-ying.huang@intel.com Signed-off-by: Huang Ying Originally-by: Dave Hansen Cc: Michal Hocko Cc: Wei Xu Cc: Yang Shi Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0aaf91b496e2..4ca025e2a77e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,8 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern bool numa_demotion_enabled; + #else struct mempolicy {}; @@ -292,5 +294,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +#define numa_demotion_enabled false #endif /* CONFIG_NUMA */ #endif -- cgit v1.2.3 From 9647875be52b33fe22cb034ec3074896c581543f Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 2 Sep 2021 14:59:36 -0700 Subject: mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get the memcg directly from vmpr instead of vmpr->memcg->css->memcg, so add a new helper function, vmpressure_to_memcg(). No code will use vmpressure_to_css() any more, so delete it.
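Given the existing memcg_to_vmpressure() direction, the new helper is presumably just the inverse container_of() lookup, roughly:

struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
{
	/* sketch: assumes the vmpressure struct is embedded in struct mem_cgroup */
	return container_of(vmpr, struct mem_cgroup, vmpressure);
}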
Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6d28bc433c1c..6a2f51ebbfd3 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); -extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr); extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -- cgit v1.2.3 From b87c517ac5de168aec6e8318ca0707b11b2ccfaf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:46 -0700 Subject: mm/vmscan: remove unneeded return value of kswapd_run() The return value of kswapd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20210717065911.61497-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30d26b0f71d..ba52f3a3478e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void) extern void check_move_unevictable_pages(struct pagevec *pvec); -extern int kswapd_run(int nid); +extern void kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP -- cgit v1.2.3 From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: mm: compaction: support triggering of proactive compaction by user Proactive compaction[1] is triggered every 500 msec and runs compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages, based on the value set in sysctl.compaction_proactiveness. Triggering compaction every 500 msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially for embedded system use cases which may have only a few MBs of RAM. Enabling proactive compaction in its current state will end up running almost constantly on such systems. On the other hand, proactive compaction can still be very useful for getting a set of higher-order pages in a controllable manner (via sysctl.compaction_proactiveness). So, on systems where always-on proactive compaction may not be required, allow triggering it from user space by writing to its sysctl interface.
As an example, say an app launcher decides to launch a memory-heavy application which can be launched faster if it gets more higher-order pages; the launcher can then prepare the system in advance by triggering proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by the user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7acca..34bce35c808d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e..1bd5f5955f9a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available -- cgit v1.2.3 From cfcaa66f803233c50e17239469f6c96136a673a1 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 2 Sep 2021 15:00:13 -0700 Subject: mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" checks, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY.
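In rough shape, the hugetlb side presumably uses the helper like the sketch below; the exact gfp handling and the allocator entry point (dequeue_huge_page_nodemask() here) are illustrative rather than the definitive implementation:

	if (mpol_is_preferred_many(mpol)) {
		gfp_t gfp = gfp_mask | __GFP_NOWARN;

		/* first pass: preferred nodes only, without heavy reclaim */
		gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
		page = dequeue_huge_page_nodemask(h, gfp, nid, nodemask);

		/* second pass: fall back to all other nodes */
		if (!page)
			page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
	}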
[akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] [ben.widawsky@intel.com: add helpers to avoid ifdefs] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com [nathan@kernel.org: initialize page to NULL in alloc_buddy_huge_page_with_mpol()] Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Signed-off-by: Nathan Chancellor Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4ca025e2a77e..4091692bed8c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -186,6 +186,12 @@ extern void mpol_put_task_policy(struct task_struct *); extern bool numa_demotion_enabled; +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + + #else struct mempolicy {}; @@ -296,5 +302,11 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) } #define numa_demotion_enabled false + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} + #endif /* CONFIG_NUMA */ #endif -- cgit v1.2.3 From a7259df7670240ee03b0cfce8a3e5d3773911e24 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 15:00:26 -0700 Subject: memblock: make memblock_find_in_range method private There are a lot of uses of memblock_find_in_range() along with memblock_reserve() from the times when the memblock allocation APIs did not exist. memblock_find_in_range() is the very core of memblock allocations, so any future changes to its internal behaviour would mandate updates of all the users outside memblock. Replace the calls to memblock_find_in_range() with equivalent calls to memblock_phys_alloc() and memblock_phys_alloc_range(), and make memblock_find_in_range() a private method of memblock. This simplifies the callers, ensures that (unlikely) errors in memblock_reserve() are handled and improves maintainability of memblock_find_in_range(). Link: https://lkml.kernel.org/r/20210816122622.30279-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Catalin Marinas [arm64] Acked-by: Kirill A. Shutemov Acked-by: Rafael J.
Wysocki [ACPI] Acked-by: Russell King (Oracle) Acked-by: Nick Kossifidis [riscv] Tested-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4a53c3ca86bd..b066024c62e3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -99,8 +99,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); -- cgit v1.2.3 From 0b303fb402862dcb7948eeeed2439bd8c99948b5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 8 May 2021 02:28:02 +0200 Subject: mm, slub: do initial checks in ___slab_alloc() with irqs enabled As another step of shortening irq disabled sections in ___slab_alloc(), delay disabling irqs until we pass the initial checks if there is a cached percpu slab and it's suitable for our allocation. Now we have to recheck c->page after actually disabling irqs as an allocation in irq handler might have replaced it. Because we call pfmemalloc_match() as one of the checks, we might hit VM_BUG_ON_PAGE(!PageSlab(page)) in PageSlabPfmemalloc in case we get interrupted and the page is freed. Thus introduce a pfmemalloc_match_unsafe() variant that lacks the PageSlab check. 
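The variant is presumably a near-copy of pfmemalloc_match() without the PageSlab assertion, built on the __PageSlabPfmemalloc() helper added below; a minimal sketch:

static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
{
	/*
	 * Sketch: same test as pfmemalloc_match(), but the page may have
	 * been freed under us, so use the VM_BUG_ON-free flag check.
	 */
	if (unlikely(__PageSlabPfmemalloc(page)))
		return gfp_pfmemalloc_allowed(gfpflags);

	return true;
}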
Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman --- include/linux/page-flags.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..7fda4fb85bdc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -815,6 +815,15 @@ static inline int PageSlabPfmemalloc(struct page *page) return PageActive(page); } +/* + * A version of PageSlabPfmemalloc() for opportunistic checks where the page + * might have been freed under us and not be a PageSlab anymore. + */ +static inline int __PageSlabPfmemalloc(struct page *page) +{ + return PageActive(page); +} + static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); -- cgit v1.2.3 From 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 11:18:44 +0100 Subject: iov_iter: track truncated size Remember how many bytes were truncated and then reverted. Because iterators that have not been reexpanded don't always work well with reverting, we may need to know the truncated size in order to reexpand ourselves when needed. Signed-off-by: Pavel Begunkov Signed-off-by: Al Viro --- include/linux/uio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 82c3c3e819e0..5265024e8b90 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -47,6 +47,7 @@ struct iov_iter { }; loff_t xarray_start; }; + size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -254,8 +255,10 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ - if (i->count > count) + if (i->count > count) { + i->truncated += i->count - count; i->count = count; + } } /* @@ -264,6 +267,7 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { + i->truncated -= count - i->count; i->count = count; } -- cgit v1.2.3 From bd0e7491a931f5a2960555b10b9551464ff8cc8e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sat, 22 May 2021 01:59:38 +0200 Subject: mm, slub: convert kmem_cpu_slab protection to local_lock Embed local_lock into struct kmem_cpu_slab and use the irq-safe versions of local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's equivalent, with better lockdep visibility. On PREEMPT_RT that means better preemption. However, the cost on PREEMPT_RT is the loss of lockless fast paths which only work with the cpu freelist. Those are designed to detect and recover from being preempted by other conflicting operations (both fast or slow path), but the slow path operations assume they cannot be preempted by a fast path operation, which is guaranteed naturally with disabled irqs. With local locks on PREEMPT_RT, the fast paths now also need to take the local lock to avoid races. In the allocation fastpath slab_alloc_node() we can just defer to the slowpath __slab_alloc() which also works with the cpu freelist, but under the local lock. In the free fastpath do_slab_free() we have to add a new local lock protected version of freeing to the cpu freelist, as the existing slowpath only works with the page freelist. Also update the comment about the locking scheme in SLUB to reflect changes done by this series.
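In shape, the conversion swaps bare IRQ toggling for the embedded lock; a sketch using the field added in the hunk below (the surrounding slow-path code in mm/slub.c is abbreviated):

	unsigned long flags;
	struct kmem_cache_cpu *c;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	c = this_cpu_ptr(s->cpu_slab);
	/* ... manipulate c->freelist / c->page under the local lock ... */
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	/* was: local_irq_restore(flags); */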
[ Mike Galbraith : use local_lock() without irq in PREEMPT_RT scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ] Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dcde82a4434c..85499f0586b0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -40,6 +41,10 @@ enum stat_item { CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ NR_SLUB_STAT_ITEMS }; +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ @@ -47,6 +52,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ #endif + local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif -- cgit v1.2.3 From 48eab831ae8b9f7002a533fa4235eed63ea1f1a3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Sep 2021 11:10:37 -0700 Subject: net: create netdev->dev_addr assignment helpers Recent work on converting address list to a tree made it obvious we need an abstraction around writing netdev->dev_addr. Without such abstraction updating the main device address is invisible to the core. Introduce a number of helpers which for now just wrap memcpy() but in the future can make necessary changes to the address tree. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 12 ++++++++++++ include/linux/netdevice.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 330345b1be54..928c411bd509 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -299,6 +299,18 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) #endif } +/** + * eth_hw_addr_set - Assign Ethernet address to a net_device + * @dev: pointer to net_device structure + * @addr: address to assign + * + * Assign given address to the net_device, addr_assign_type is not changed. 
+ */ +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + ether_addr_copy(dev->dev_addr, addr); +} + /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c41593c1d6a..d79163208dfd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4641,6 +4641,24 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +static inline void +__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len) +{ + memcpy(dev->dev_addr, addr, len); +} + +static inline void dev_addr_set(struct net_device *dev, const u8 *addr) +{ + __dev_addr_set(dev, addr, dev->addr_len); +} + +static inline void +dev_addr_mod(struct net_device *dev, unsigned int offset, + const u8 *addr, size_t len) +{ + memcpy(&dev->dev_addr[offset], addr, len); +} + int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, -- cgit v1.2.3 From 8486a32dd484a7d7ec25295c7439094608f54915 Mon Sep 17 00:00:00 2001 From: "Hector.Yuan" Date: Fri, 3 Sep 2021 16:39:23 +0800 Subject: cpufreq: Add of_perf_domain_get_sharing_cpumask Add the of_perf_domain_get_sharing_cpumask() function to group CPUs into a specific performance domain. Signed-off-by: Hector.Yuan [ Viresh: create separate routine parse_perf_domain() and always set the cpumask. ] Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c65a1d7385f8..acd3ee5b8b0a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -1003,6 +1005,55 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy return count; } + +static inline int parse_perf_domain(int cpu, const char *list_name, + const char *cell_name) +{ + struct device_node *cpu_np; + struct of_phandle_args args; + int ret; + + cpu_np = of_cpu_device_node_get(cpu); + if (!cpu_np) + return -ENODEV; + + ret = of_parse_phandle_with_args(cpu_np, list_name, cell_name, 0, + &args); + if (ret < 0) + return ret; + + of_node_put(cpu_np); + + return args.args[0]; +} + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + int target_idx; + int cpu, ret; + + ret = parse_perf_domain(pcpu, list_name, cell_name); + if (ret < 0) + return ret; + + target_idx = ret; + cpumask_set_cpu(pcpu, cpumask); + + for_each_possible_cpu(cpu) { + if (cpu == pcpu) + continue; + + ret = parse_perf_domain(cpu, list_name, cell_name); + if (ret < 0) + continue; + + if (target_idx == ret) + cpumask_set_cpu(cpu, cpumask); + } + + return target_idx; +} #else static inline int cpufreq_boost_trigger_state(int state) { @@ -1022,6 +1073,12 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) { return false; } + +static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, + const char *cell_name, struct cpumask *cpumask) +{ + return -EOPNOTSUPP; +} #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -1043,7
+1100,6 @@ void arch_set_freq_scale(const struct cpumask *cpus, { } #endif - /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -- cgit v1.2.3 From 3cc4e148b96263313e3dce926eae569c942bb74e Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 17 Aug 2021 00:26:39 +0000 Subject: KVM: stats: Add VM stat for remote tlb flush requests Add a new stat that counts the number of times a remote TLB flush is requested, regardless of whether it kicks vCPUs out of guest mode. This allows us to look at how often flushes are initiated. Unlike remote_tlb_flush, this one applies to ARM's instruction-set-based TLB flush implementation, so apply it there too. Original-by: David Matlack Signed-off-by: Jing Zhang Message-Id: <20210817002639.3856694-1-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- include/linux/kvm_types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e4d712e9f760..c177789a8542 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1444,7 +1444,8 @@ struct _kvm_stats_desc { KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ - STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index de7fb5f364d8..2237abb93ccd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -80,6 +80,7 @@ struct kvm_mmu_memory_cache { struct kvm_vm_stat_generic { u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; }; struct kvm_vcpu_stat_generic { -- cgit v1.2.3 From 9c930054f2f5326d59ee4bf8d7d1cf6c82f5643b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:24 +0800 Subject: file: Export receive_fd() to modules Export receive_fd() so that modules can use it to pass file descriptors between processes without missing any of the security checks. Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-4-xieyongji@bytedance.com Signed-off-by: Michael S.
Tsirkin --- include/linux/file.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 2de2e4613d7b..51e830b4fe3a 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -94,6 +94,9 @@ extern void fd_install(unsigned int fd, struct file *file); extern int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); + +extern int receive_fd(struct file *file, unsigned int o_flags); + static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { @@ -101,10 +104,6 @@ static inline int receive_fd_user(struct file *file, int __user *ufd, return -EFAULT; return __receive_fd(file, ufd, o_flags); } -static inline int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); -- cgit v1.2.3 From 86e17a51c1a5a299009f8b1645e3e9da0d59faae Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:25 +0800 Subject: vdpa: Fix some coding style issues Fix some code indent issues and following checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' 371: FILE: include/linux/vdpa.h:371: +static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, Signed-off-by: Xie Yongji Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210831103634.33-5-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8cfe49d201dd..8ae1134070eb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -43,17 +43,17 @@ struct vdpa_vq_state_split { * @last_used_idx: used index */ struct vdpa_vq_state_packed { - u16 last_avail_counter:1; - u16 last_avail_idx:15; - u16 last_used_counter:1; - u16 last_used_idx:15; + u16 last_avail_counter:1; + u16 last_avail_idx:15; + u16 last_used_counter:1; + u16 last_used_idx:15; }; struct vdpa_vq_state { - union { - struct vdpa_vq_state_split split; - struct vdpa_vq_state_packed packed; - }; + union { + struct vdpa_vq_state_split split; + struct vdpa_vq_state_packed packed; + }; }; struct vdpa_mgmt_dev; @@ -131,7 +131,7 @@ struct vdpa_iova_range { * @vdev: vdpa device * @idx: virtqueue index * @state: pointer to returned state (last_avail_idx) - * @get_vq_notification: Get the notification area for a virtqueue + * @get_vq_notification: Get the notification area for a virtqueue * @vdev: vdpa device * @idx: virtqueue index * Returns the notifcation area @@ -350,25 +350,25 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) static inline void vdpa_reset(struct vdpa_device *vdev) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + ops->set_status(vdev, 0); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = true; - return ops->set_features(vdev, features); + return ops->set_features(vdev, features); } - -static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, - void *buf, unsigned int len) +static 
inline void vdpa_get_config(struct vdpa_device *vdev, + unsigned int offset, void *buf, + unsigned int len) { - const struct vdpa_config_ops *ops = vdev->config; + const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. -- cgit v1.2.3 From 0686082dbf7a204ca0fab326a820779e31666639 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:26 +0800 Subject: vdpa: Add reset callback in vdpa_config_ops This adds a new callback to support device specific reset behavior. The vdpa bus driver will call the reset function instead of setting status to zero during resetting. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210831103634.33-6-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8ae1134070eb..e1eae8c7483d 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -171,6 +171,9 @@ struct vdpa_iova_range { * @set_status: Set the device status * @vdev: vdpa device * @status: virtio device status + * @reset: Reset device + * @vdev: vdpa device + * Returns integer: success (0) or error (< 0) * @get_config_size: Get the size of the configuration space * @vdev: vdpa device * Returns size_t: configuration size @@ -255,6 +258,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + int (*reset)(struct vdpa_device *vdev); size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); @@ -348,12 +352,12 @@ static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) return vdev->dma_dev; } -static inline void vdpa_reset(struct vdpa_device *vdev) +static inline int vdpa_reset(struct vdpa_device *vdev) { const struct vdpa_config_ops *ops = vdev->config; vdev->features_valid = false; - ops->set_status(vdev, 0); + return ops->reset(vdev); } static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) -- cgit v1.2.3 From 59dfe4f1e810b5820443c84f9863b04b033143e8 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:28 +0800 Subject: vhost-iotlb: Add an opaque pointer for vhost IOTLB Add an opaque pointer for vhost IOTLB. And introduce vhost_iotlb_add_range_ctx() to accept it. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-8-xieyongji@bytedance.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vhost_iotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h index 6b09b786a762..2d0e2f52f938 100644 --- a/include/linux/vhost_iotlb.h +++ b/include/linux/vhost_iotlb.h @@ -17,6 +17,7 @@ struct vhost_iotlb_map { u32 perm; u32 flags_padding; u64 __subtree_last; + void *opaque; }; #define VHOST_IOTLB_FLAG_RETIRE 0x1 @@ -29,6 +30,8 @@ struct vhost_iotlb { unsigned int flags; }; +int vhost_iotlb_add_range_ctx(struct vhost_iotlb *iotlb, u64 start, u64 last, + u64 addr, unsigned int perm, void *opaque); int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, u64 addr, unsigned int perm); void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); -- cgit v1.2.3 From c10fb9454adc80c062151c6a436047e1fa59e99f Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:29 +0800 Subject: vdpa: Add an opaque pointer for vdpa_config_ops.dma_map() Add an opaque pointer for DMA mapping. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-9-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index e1eae8c7483d..f3014aaca47e 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -270,7 +270,7 @@ struct vdpa_config_ops { /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, - u64 pa, u32 perm); + u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); /* Free device resources */ -- cgit v1.2.3 From d8945ec411209272bcd4ae9e75ea1b078257e492 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 31 Aug 2021 18:36:31 +0800 Subject: vdpa: Support transferring virtual addressing during DMA mapping This patch introduces an attribute for vDPA device to indicate whether virtual address can be used. If vDPA device driver set it, vhost-vdpa bus driver will not pin user page and transfer userspace virtual address instead of physical address during DMA mapping. And corresponding vma->vm_file and offset will be also passed as an opaque pointer. Suggested-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210831103634.33-11-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index f3014aaca47e..3972ab765de1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -65,6 +65,7 @@ struct vdpa_mgmt_dev; * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @use_va: indicate whether virtual address must be used by this device * @nvqs: maximum number of supported virtqueues * @mdev: management device pointer; caller must setup when registering device as part * of dev_add() mgmtdev ops callback before invoking _vdpa_register_device(). 
@@ -75,6 +76,7 @@ struct vdpa_device { const struct vdpa_config_ops *config; unsigned int index; bool features_valid; + bool use_va; int nvqs; struct vdpa_mgmt_dev *mdev; }; @@ -89,6 +91,16 @@ struct vdpa_iova_range { u64 last; }; +/** + * Corresponding file area for device memory mapping + * @file: vma->vm_file for the mapping + * @offset: mapping offset in the vm_file + */ +struct vdpa_map_file { + struct file *file; + u64 offset; +}; + /** * struct vdpa_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the @@ -279,7 +291,8 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - size_t size, const char *name); + size_t size, const char *name, + bool use_va); /** * vdpa_alloc_device - allocate and initilaize a vDPA device @@ -289,15 +302,16 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @parent: the parent device * @config: the bus operations that is supported by this device * @name: name of the vdpa device + * @use_va: indicate whether virtual address must be used by this device * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, name) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, name, use_va) \ container_of(__vdpa_alloc_device( \ parent, config, \ sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - dev_struct, member)), name), \ + dev_struct, member)), name, use_va), \ dev_struct, member) int vdpa_register_device(struct vdpa_device *vdev, int nvqs); -- cgit v1.2.3 From f97493657c6372eeefe70faadd214bf31488c44e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 7 Sep 2021 18:56:46 +0800 Subject: net: phylink: add suspend/resume support Joakim Zhang reports that Wake-on-Lan with the stmmac ethernet driver broke when moving the incorrect handling of mac link state out of mac_config(). This reason this breaks is because the stmmac's WoL is handled by the MAC rather than the PHY, and phylink doesn't cater for that scenario. This patch adds the necessary phylink code to handle suspend/resume events according to whether the MAC still needs a valid link or not. This is the barest minimum for this support. Reported-by: Joakim Zhang Tested-by: Joakim Zhang Signed-off-by: Russell King (Oracle) Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index afb3ded0b691..237291196ce2 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -451,6 +451,9 @@ void phylink_mac_change(struct phylink *, bool up); void phylink_start(struct phylink *); void phylink_stop(struct phylink *); +void phylink_suspend(struct phylink *pl, bool mac_wol); +void phylink_resume(struct phylink *pl); + void phylink_ethtool_get_wol(struct phylink *, struct ethtool_wolinfo *); int phylink_ethtool_set_wol(struct phylink *, struct ethtool_wolinfo *); -- cgit v1.2.3 From cd1adf1b63a112d762832e9c64b0a886fbb840d6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 7 Sep 2021 11:03:45 -0700 Subject: Revert "mm/gup: remove try_get_page(), call try_get_compound_head() directly" This reverts commit 9857a17f206ff374aea78bccfb687f145368be2e. That commit was completely broken, and I should have caught on to it earlier. But happily, the kernel test robot noticed the breakage fairly quickly. 
The breakage is because "try_get_page()" is about avoiding the page reference count overflow case, but is otherwise the exact same as a plain "get_page()". In contrast, "try_get_compound_head()" is an entirely different beast, and uses __page_cache_add_speculative() because it's not just about the page reference count, but also about possibly racing with the underlying page going away. So all the commentary about how "try_get_page() has fallen a little behind in terms of maintenance, try_get_compound_head() handles speculative page references more thoroughly" was just completely wrong: yes, try_get_compound_head() handles speculative page references, but the point is that try_get_page() does not, and must not. So there's no lack of maintenance - there are fundamentally different semantics. A speculative page reference would be entirely wrong in "get_page()", and it's entirely wrong in "try_get_page()". It's not about speculation, it's purely about "uhhuh, you can't get this page because you've tried to increment the reference count too much already". The reason the kernel test robot noticed this bug was that it hit the VM_BUG_ON() in __page_cache_add_speculative(), which is all about verifying that the context of any speculative page access is correct. But since that isn't what try_get_page() is all about, the VM_BUG_ON() tests things that are not correct to test for try_get_page(). Reported-by: kernel test robot Cc: John Hubbard Cc: Christoph Hellwig Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 50e2c2914ac2..73a52aba448f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1218,7 +1218,15 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); -struct page *try_get_compound_head(struct page *page, int refs); + +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} static inline void put_page(struct page *page) { -- cgit v1.2.3 From ca67408ad57a5a67ad6801d792c40c010451bdef Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 6 Sep 2021 09:44:52 +0100 Subject: PM: EM: fix kernel-doc comments Fix the kernel-doc comments for the improved Energy Model documentation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1834752c5617..39dcadd492b5 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -11,7 +11,7 @@ #include /** - * em_perf_state - Performance state of a performance domain + * struct em_perf_state - Performance state of a performance domain * @frequency: The frequency in KHz, for consistency with CPUFreq * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic.
@@ -25,7 +25,7 @@ struct em_perf_state { }; /** - * em_perf_domain - Performance domain + * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states * @milliwatts: Flag indicating the power values are in milli-Watts @@ -103,12 +103,12 @@ void em_dev_unregister_perf_domain(struct device *dev); /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a - performance domain + * performance domain * @pd : performance domain for which energy has to be estimated * @max_util : highest utilization among CPUs of the domain * @sum_util : sum of the utilization of all CPUs in the domain * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which - might reflect reduced frequency (due to thermal) + * might reflect reduced frequency (due to thermal) * * This function must be used only for CPU devices. There is no validation, * i.e. if the EM is a CPU type and has cpumask allocated. It is called from -- cgit v1.2.3 From 39ff83f2f6cc5cc1458dfcea9697f96338210beb Mon Sep 17 00:00:00 2001 From: Lukas Hannen Date: Wed, 25 Aug 2021 10:12:43 +0000 Subject: time: Handle negative seconds correctly in timespec64_to_ns() timespec64_to_ns() prevents multiplication overflows by comparing the seconds value of the timespec to KTIME_SEC_MAX. If the value is greater or equal, it returns KTIME_MAX. But that check casts the signed seconds value to unsigned, which makes the comparison true for all negative values and therefore wrongly returns KTIME_MAX. Negative second values are perfectly valid and required in some places, e.g. ptp_clock_adjtime(). Remove the cast and add a check for the negative boundary which is required to prevent undefined behaviour due to multiplication underflow. Fixes: cb47755725da ("time: Prevent undefined behaviour in timespec64_to_ns()") Signed-off-by: Lukas Hannen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/AM6PR01MB541637BD6F336B8FFB72AF80EEC69@AM6PR01MB5416.eurprd01.prod.exchangelabs.com --- include/linux/time64.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 5117cb5b5656..81b9686a2079 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -25,7 +25,9 @@ struct itimerspec64 { #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) +#define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): @@ -124,10 +126,13 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { - /* Prevent multiplication overflow */ - if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + /* Prevent multiplication overflow / underflow */ + if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; + if (ts->tv_sec <= KTIME_SEC_MIN) + return KTIME_MIN; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } -- cgit v1.2.3 From 859a85ddf90e714092dea71a0e54c7b9896621be Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 7 Sep 2021 19:54:52 -0700 Subject: mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE Patch series "mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE". After recent updates to freeing unused parts of the memory map, no architecture can have holes in the memory map within a pageblock.
This makes the pfn_valid_within() check and the CONFIG_HOLES_IN_ZONE configuration option redundant. The first patch removes them both in a mechanical way and the second patch simplifies memory_hotplug::test_pages_in_a_zone(), which had pfn_valid_within() surrounded by more logic than a simple if. This patch (of 2): After recent changes in freeing of the unused parts of the memory map and the rework of pfn_valid() in arm and arm64 there are no architectures that can have holes in the memory map within a pageblock, and so nothing can enable CONFIG_HOLES_IN_ZONE, which guards the non-trivial implementation of pfn_valid_within(). With that, pfn_valid_within() is always hardwired to 1 and can be completely removed. Remove calls to pfn_valid_within() and CONFIG_HOLES_IN_ZONE. Link: https://lkml.kernel.org/r/20210713080035.7464-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210713080035.7464-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..ee3a86830519 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1525,18 +1525,6 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -/* - * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we - * need to check pfn validity within that MAX_ORDER_NR_PAGES block. - * pfn_valid_within() should be used in this case; we optimise this away - * when we have no holes within a MAX_ORDER_NR_PAGES block. - */ -#ifdef CONFIG_HOLES_IN_ZONE -#define pfn_valid_within(pfn) pfn_valid(pfn) -#else -#define pfn_valid_within(pfn) (1) -#endif - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From 7cf209ba8a86410939a24cb1aeb279479a7e0ca6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:54:59 -0700 Subject: mm/memory_hotplug: use "unsigned long" for PFN in zone_for_pfn_range() Patch series "mm/memory_hotplug: preparatory patches for new online policy and memory" These are all cleanups and one fix previously sent as part of [1]: [PATCH v1 00/12] mm/memory_hotplug: "auto-movable" online policy and memory groups. These patches make sense even without the other series, therefore I pulled them out to make the other series easier to digest. [1] https://lkml.kernel.org/r/20210607195430.48228-1-david@redhat.com This patch (of 4): Checkpatch complained about a follow-up patch that we are using "unsigned" here, which defaults to "unsigned int", and checkpatch is correct. As we would search for a fitting zone using the wrong pfn (the value truncated to 32 bits), we might end up onlining memory to one of the special kernel zones, such as ZONE_DMA, which can end badly as the onlined memory does not satisfy properties of these zones. Use "unsigned long" instead, just as we do in other places when handling PFNs. This can bite us once we have physical addresses in the range of multiple TB. Link: https://lkml.kernel.org/r/20210712124052.26491-2-david@redhat.com Fixes: e5e689302633 ("mm, memory_hotplug: display allowed zones in the preferred ordering") Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S.
Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Heiko Carstens Cc: Michael Ellerman Cc: Catalin Marinas Cc: virtualization@lists.linux-foundation.org Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Jiang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Laurent Dufour Cc: Michel Lespinasse Cc: Nathan Lynch Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Scott Cheloha Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Will Deacon Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a7fd2c3ccb77..d01b504ce06f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -339,8 +339,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, - unsigned long nr_pages); +extern struct zone *zone_for_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -- cgit v1.2.3 From 65a2aa5f482ed0c1b5afb9e6b0b9e0b16bb8b616 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:04 -0700 Subject: mm/memory_hotplug: remove nid parameter from arch_remove_memory() The parameter is unused, let's remove it. Link: https://lkml.kernel.org/r/20210712124052.26491-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas Acked-by: Michael Ellerman [powerpc] Acked-by: Heiko Carstens [s390] Reviewed-by: Pankaj Gupta Reviewed-by: Oscar Salvador Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Baoquan He Cc: Laurent Dufour Cc: Sergei Trofimovich Cc: Kefeng Wang Cc: Michel Lespinasse Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: Thiago Jung Bauermann Cc: Joe Perches Cc: Pierre Morel Cc: Jia He Cc: Anton Blanchard Cc: Dan Williams Cc: Dave Jiang Cc: Jason Wang Cc: Len Brown Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Nathan Lynch Cc: Pankaj Gupta Cc: "Rafael J. Wysocki" Cc: "Rafael J. 
Wysocki" Cc: Scott Cheloha Cc: Vishal Verma Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d01b504ce06f..010a192298b5 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -130,8 +130,7 @@ static inline bool movable_node_is_enabled(void) return movable_node_enabled; } -extern void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); +extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); -- cgit v1.2.3 From e1c158e4956612e7bada4c03dfb99210af4d6cde Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:09 -0700 Subject: mm/memory_hotplug: remove nid parameter from remove_memory() and friends There is only a single user remaining. We can simply lookup the nid only used for node offlining purposes when walking our memory blocks. We don't expect to remove multi-nid ranges; and if we'd ever do, we most probably don't care about removing multi-nid ranges that actually result in empty nodes. If ever required, we can detect the "multi-nid" scenario and simply try offlining all online nodes. Link: https://lkml.kernel.org/r/20210712124052.26491-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael Ellerman (powerpc) Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Nathan Lynch Cc: Laurent Dufour Cc: "Aneesh Kumar K.V" Cc: Scott Cheloha Cc: Anton Blanchard Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jia He Cc: Joe Perches Cc: Kefeng Wang Cc: Michal Hocko Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Pierre Morel Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Sergei Trofimovich Cc: Thiago Jung Bauermann Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010a192298b5..068e3dcf4690 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -292,9 +292,9 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern int remove_memory(int nid, u64 start, u64 size); -extern void __remove_memory(int nid, u64 start, u64 size); -extern int offline_and_remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern void __remove_memory(u64 start, u64 size); +extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} @@ -304,12 +304,12 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline int remove_memory(int nid, u64 start, u64 size) +static inline int remove_memory(u64 start, u64 size) { return -EBUSY; } -static inline void __remove_memory(int nid, u64 start, u64 size) {} +static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ extern void set_zone_contiguous(struct zone *zone); -- cgit v1.2.3 From 4b0970024408afb17886e0c76e9761c4264db2a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:19 -0700 Subject: mm: track present early pages per zone Patch series "mm/memory_hotplug: "auto-movable" online policy and memory groups", v3. I. Goal The goal of this series is improving in-kernel auto-online support. It tackles the fundamental problems that: 1) We can create zone imbalances when onlining all memory blindly to ZONE_MOVABLE, in the worst case crashing the system. We have to know upfront how much memory we are going to hotplug such that we can safely enable auto-onlining of all hotplugged memory to ZONE_MOVABLE via "online_movable". This is far from practical and only applicable in limited setups -- like inside VMs under the RHV/oVirt hypervisor which will never hotplug more than 3 times the boot memory (and the limitation is only in place due to the Linux limitation). 2) We see more setups that implement dynamic VM resizing, hot(un)plugging memory to resize VM memory. In these setups, we might hotplug a lot of memory, but it might happen in various small steps in both directions (e.g., 2 GiB -> 8 GiB -> 4 GiB -> 16 GiB ...). virtio-mem is the primary driver of this upstream right now, performing such dynamic resizing NUMA-aware via multiple virtio-mem devices. Onlining all hotplugged memory to ZONE_NORMAL means we basically have no hotunplug guarantees. Onlining all to ZONE_MOVABLE means we can easily run into zone imbalances when growing a VM. We want a mixture, and we want as much memory as reasonable/configured in ZONE_MOVABLE. Details regarding zone imbalances can be found at [1]. 3) Memory devices consist of 1..X memory block devices, however, the kernel doesn't really track the relationship. Consequently, also user space has no idea. We want to make per-device decisions. 
As one example, for memory hotunplug it doesn't make sense to use a mixture of zones within a single DIMM: we want all MOVABLE if possible, otherwise all !MOVABLE, because any !MOVABLE part will easily block the whole DIMM from getting hotunplugged. As another example, virtio-mem operates on individual units that span 1..X memory blocks. Similar to a DIMM, we want a unit to either be all MOVABLE or !MOVABLE. A "unit" can be thought of like a DIMM, however, all units of a virtio-mem device logically belong together and are managed (added/removed) by a single driver. We want as much memory of a virtio-mem device to be MOVABLE as possible. 4) We want memory onlining to be done right from the kernel while adding memory, not triggered by user space via udev rules; for example, this is required for fast memory hotplug for drivers that add individual memory blocks, like virtio-mem. We want a way to configure a policy in the kernel and avoid implementing advanced policies in user space. The auto-onlining support we have in the kernel is not sufficient. All we have is a) online everything MOVABLE (online_movable) b) online everything !MOVABLE (online_kernel) c) keep zones contiguous (online). This series allows configuring c) to mean instead "online movable if possible according to the configuration, driven by a maximum MOVABLE:KERNEL ratio" -- a new onlining policy.
II. Approach
This series does 3 things: 1) Introduces the "auto-movable" online policy that initially operates on individual memory blocks only. It uses a maximum MOVABLE:KERNEL ratio to make a decision whether a memory block will be onlined to ZONE_MOVABLE or not. However, in the basic form, hotplugged KERNEL memory does not allow for more MOVABLE memory (details in the patches). CMA memory is treated like MOVABLE memory. 2) Introduces static (e.g., DIMM) and dynamic (e.g., virtio-mem) memory groups and uses group information to make decisions in the "auto-movable" online policy across memory blocks of a single memory device (modeled as memory group). More details can be found in patch #3 or in the DIMM example below. 3) Maximizes ZONE_MOVABLE memory within dynamic memory groups, by allowing ZONE_NORMAL memory within a dynamic memory group to allow for more ZONE_MOVABLE memory within the same memory group. The target use case is dynamic VM resizing using virtio-mem. See the virtio-mem example below. I remember that the basic idea of using a ratio to implement a policy in the kernel was once mentioned by Vitaly Kuznetsov, but I might be wrong (I lost the pointer to that discussion). For me, the main use case is using it along with virtio-mem (and DIMMs / ppc64 dlpar where necessary) for dynamic resizing of VMs, increasing the amount of memory we can hotunplug reliably again if we might eventually hotplug a lot of memory to a VM.
III. Target Usage
The target usage will be: 1) Linux boots with "mhp_default_online_type=offline" 2) User space (e.g., systemd unit) configures memory onlining (according to a config file and system properties), for example: * Setting memory_hotplug.online_policy=auto-movable * Setting memory_hotplug.auto_movable_ratio=301 * Setting memory_hotplug.auto_movable_numa_aware=true 3) User space enables auto onlining via "echo online > /sys/devices/system/memory/auto_online_blocks" 4) User space triggers manual onlining of all already-offline memory blocks (go over offline memory blocks and set them to "online")
IV. Example
For DIMMs, hotplugging 4 GiB DIMMs to a 4 GiB VM with a configured ratio of 301% results in the following layout:
Memory block 0-15:    DMA32   (early)
Memory block 32-47:   Normal  (early)
Memory block 48-79:   Movable (DIMM 0)
Memory block 80-111:  Movable (DIMM 1)
Memory block 112-143: Movable (DIMM 2)
Memory block 144-175: Normal  (DIMM 3)
Memory block 176-207: Normal  (DIMM 4)
... all Normal (-> hotplugged Normal memory does not allow for more Movable memory)
For virtio-mem, using a simple, single virtio-mem device with a 4 GiB VM will result in the following layout:
Memory block 0-15:    DMA32   (early)
Memory block 32-47:   Normal  (early)
Memory block 48-143:  Movable (virtio-mem, first 12 GiB)
Memory block 144:     Normal  (virtio-mem, next 128 MiB)
Memory block 145-147: Movable (virtio-mem, next 384 MiB)
Memory block 148:     Normal  (virtio-mem, next 128 MiB)
Memory block 149-151: Movable (virtio-mem, next 384 MiB)
... Normal/Movable mixture as above (-> hotplugged Normal memory allows for more Movable memory within the same device)
Which gives us maximum flexibility when dynamically growing/shrinking a VM in smaller steps.
V. Doc Update
I'll update the memory-hotplug.rst documentation, once the overhaul [1] is upstream. Until then, details can be found in patch #2.
VI. Future Work
1) Use memory groups for ppc64 dlpar 2) Being able to specify a portion of (early) kernel memory that will be excluded from the ratio. Like "128 MiB globally/per node" are excluded. This might be helpful when starting VMs with an extremely small memory footprint (e.g., 128 MiB) and hotplugging memory later -- not wanting the first hotplugged units getting onlined to ZONE_MOVABLE. One alternative would be a trigger to not consider ZONE_DMA memory in the ratio. We'll have to see if this is really required. 3) Indicate to user space that MOVABLE might be a bad idea -- especially relevant when memory ballooning without support for balloon compaction is active. This patch (of 9): For implementing a new memory onlining policy, which determines when to online memory blocks to ZONE_MOVABLE semi-automatically, we need the number of present early (boot) pages -- present pages excluding hotplugged pages. Let's track these pages per zone. Pass a page instead of the zone to adjust_present_page_count(), similar to adjust_managed_page_count(), and derive the zone from the page. It's worth noting that a memory block to be offlined/onlined is either completely "early" or "not early". add_memory() and friends can only add complete memory blocks and we only online/offline complete (individual) memory blocks. Link: https://lkml.kernel.org/r/20210806124715.17090-1-david@redhat.com Link: https://lkml.kernel.org/r/20210806124715.17090-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Vitaly Kuznetsov Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Marek Kedzierski Cc: Hui Zhu Cc: Pankaj Gupta Cc: Wei Yang Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Anshuman Khandual Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mike Rapoport Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Rafael J.
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- include/linux/mmzone.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 068e3dcf4690..39b04e99a30e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -95,7 +95,7 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct zone *zone, long nr_pages); +extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ee3a86830519..1c0e3bf42521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -540,6 +540,10 @@ struct zone { * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * + * present_early_pages is present pages existing within the zone + * located on memory available since early boot, excluding hotplugged + * memory. + * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): @@ -572,6 +576,9 @@ struct zone { atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; +#if defined(CONFIG_MEMORY_HOTPLUG) + unsigned long present_early_pages; +#endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif -- cgit v1.2.3 From 028fc57a1c361116e3bcebfeba4ca87878baaf4f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:26 -0700 Subject: drivers/base/memory: introduce "memory groups" to logically group memory blocks In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. 
We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Link: https://lkml.kernel.org/r/20210806124715.17090-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 46 +++++++++++++++++++++++++++++++++++++++++- include/linux/memory_hotplug.h | 5 +++++ 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d505c12c5c77 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +/** + * struct memory_group - a logical group of memory blocks + * @nid: The node id for all memory blocks inside the memory group. + * @blocks: List of all memory blocks belonging to this memory group. + * @is_dynamic: The memory group type: static vs. dynamic + * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum + * number of pages we'll have in this static memory group. + * @d.unit_pages: Valid with &memory_group.is_dynamic == true. Unit in pages + * in which memory is added/removed in this dynamic memory group. + * This granularity defines the alignment of a unit in physical + * address space; it has to be at least as big as a single + * memory block. + * + * A memory group logically groups memory blocks; each memory block + * belongs to at most one memory group. A memory group corresponds to + * a memory device, such as a DIMM or a NUMA node, which spans multiple + * memory blocks and might even span multiple non-contiguous physical memory + * ranges. + * + * Modification of members after registration is serialized by memory + * hot(un)plug code. + */ +struct memory_group { + int nid; + struct list_head memory_blocks; + bool is_dynamic; + union { + struct { + unsigned long max_pages; + } s; + struct { + unsigned long unit_pages; + } d; + }; +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -34,6 +70,8 @@ struct memory_block { * lay at the beginning of the memory block. 
*/ unsigned long nr_vmemmap_pages; + struct memory_group *group; /* group (if any) for this block */ + struct list_head group_next; /* next block inside memory group */ }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -86,7 +124,8 @@ static inline int memory_notify(unsigned long val, void *v) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages); + unsigned long vmemmap_pages, + struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); @@ -96,6 +135,11 @@ extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) -- cgit v1.2.3 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:30 -0700 Subject: mm/memory_hotplug: track present pages in memory groups Let's track all present pages in each memory group. Especially, track memory present in ZONE_MOVABLE and memory present in one of the kernel zones (which really only is ZONE_NORMAL right now as memory groups only apply to hotplugged memory) separately within a memory group, to prepare for making smart auto-online decisions for individual memory blocks within a memory group based on group statistics. Link: https://lkml.kernel.org/r/20210806124715.17090-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 6 ++++++ include/linux/memory_hotplug.h | 13 +++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index d505c12c5c77..6ffdc1db385f 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -27,6 +27,10 @@ * struct memory_group - a logical group of memory blocks * @nid: The node id for all memory blocks inside the memory group. * @blocks: List of all memory blocks belonging to this memory group. + * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this + * memory group. + * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this + * memory group. * @is_dynamic: The memory group type: static vs. dynamic * @s.max_pages: Valid with &memory_group.is_dynamic == false. The maximum number of pages we'll have in this static memory group.
@@ -48,6 +52,8 @@ struct memory_group { int nid; struct list_head memory_blocks; + unsigned long present_kernel_pages; + unsigned long present_movable_pages; bool is_dynamic; union { struct { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5d341978b4bc..cf3f423c8a74 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -12,6 +12,7 @@ struct zone; struct pglist_data; struct mem_section; struct memory_block; +struct memory_group; struct resource; struct vmem_altmap; @@ -100,13 +101,15 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); -extern void adjust_present_page_count(struct page *page, long nr_pages); +extern void adjust_present_page_count(struct page *page, + struct memory_group *group, + long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, struct memory_group *group); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -296,7 +299,8 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} #ifdef CONFIG_MEMORY_HOTREMOVE extern void try_offline_node(int nid); -extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); +extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group); extern int remove_memory(u64 start, u64 size); extern void __remove_memory(u64 start, u64 size); extern int offline_and_remove_memory(u64 start, u64 size); @@ -304,7 +308,8 @@ extern int offline_and_remove_memory(u64 start, u64 size); #else static inline void try_offline_node(int nid) {} -static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct memory_group *group) { return -EINVAL; } -- cgit v1.2.3 From 445fcf7c721450dd1d4ec6c217b3c6a932602a44 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:45 -0700 Subject: mm/memory_hotplug: memory group aware "auto-movable" online policy Use memory groups to improve our "auto-movable" onlining policy: 1. For static memory groups (e.g., a DIMM), online a memory block MOVABLE only if all other memory blocks in the group are either MOVABLE or could be onlined MOVABLE. A DIMM will either be MOVABLE or not, not a mixture. 2. For dynamic memory groups (e.g., a virtio-mem device), online a memory block MOVABLE only if all other memory blocks inside the current unit are either MOVABLE or could be onlined MOVABLE. For a virtio-mem device with a device block size of 512 MiB, all 128 MiB memory blocks within a 512 MiB unit will either be MOVABLE or not, not a mixture. We have to pass the memory group to zone_for_pfn_range() to take the memory group into account. Note: for now, there seems to be no compelling reason to make this behavior configurable.
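To make rule 1 above concrete, here is a minimal C sketch of the static-group check; it is illustrative only, and the iterator for_each_group_memory_block() as well as the predicate block_is_or_could_be_movable() are invented names, not the kernel's actual implementation:

/*
 * Illustrative sketch only -- not the kernel's implementation.
 * for_each_group_memory_block() and block_is_or_could_be_movable()
 * are made-up helpers for this example.
 */
static bool group_allows_movable(struct memory_group *group,
				 struct memory_block *new_block)
{
	struct memory_block *mem;

	/* A static group (e.g., a DIMM) must be all MOVABLE or not at all. */
	for_each_group_memory_block(mem, group) {
		if (mem != new_block && !block_is_or_could_be_movable(mem))
			return false;
	}
	return true;
}

For dynamic groups, the same check would simply be restricted to the memory blocks of the unit currently being onlined.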
Link: https://lkml.kernel.org/r/20210806124715.17090-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cf3f423c8a74..e5a867c950b2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -349,7 +349,8 @@ extern void sparse_remove_section(struct mem_section *ms, extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages); + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages); extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -- cgit v1.2.3 From 3fcebf90209a7f52d384ad7701425aa91be309ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Sep 2021 19:55:48 -0700 Subject: mm/memory_hotplug: improved dynamic memory group aware "auto-movable" online policy Currently, the "auto-movable" online policy does not allow for hotplugged KERNEL (ZONE_NORMAL) memory to increase the amount of MOVABLE memory we can have, primarily, because there is no coordination across memory devices and we don't want to create zone imbalances accidentally when unplugging memory. However, within a single memory device it's different. Let's allow for KERNEL memory within a dynamic memory group to allow for more MOVABLE within the same memory group. The only thing we have to take care of is that the managing driver avoids zone imbalances by unplugging MOVABLE memory first, otherwise there can be corner cases where unplug of memory could result in (accidental) zone imbalances. virtio-mem is the only user of dynamic memory groups and recently added support for prioritizing unplug of ZONE_MOVABLE over ZONE_NORMAL, so we don't need a new toggle to enable it for dynamic memory groups. We limit this handling to dynamic memory groups, because: * We want to keep the runtime overhead for collecting stats when onlining a single memory block small. We tend to have only a handful of dynamic memory groups, but we can have quite a few static memory groups (e.g., 256 DIMMs). * It doesn't make too much sense for static memory groups, as we try onlining all applicable memory blocks either completely to ZONE_MOVABLE or not. In ordinary operation, we won't have a mixture of zones within a static memory group. When adding memory to a dynamic memory group, we'll first online memory to ZONE_MOVABLE as long as early KERNEL memory allows for it. Then, we'll online the next unit(s) to ZONE_NORMAL, until we can online the next unit(s) to ZONE_MOVABLE. For a simple virtio-mem device with a MOVABLE:KERNEL ratio of 3:1, it will result in a layout like:
[M][M][M][M][M][M][M][M][N][M][M][M][N][M][M][M]...
^ movable memory due to early kernel memory
^ allows for more movable memory ...
^-----^ ... here
^ allows for more movable memory ...
^-----^ ... here
While the created layout is sub-optimal when it comes to contiguous zones, it gives us the maximum flexibility when dynamically growing/shrinking a device; we can grow small VMs really big in small steps, and still shrink reliably to e.g., 1/4 of the maximum VM size in this example, removing full memory blocks along with meta data more reliably. Mark dynamic memory groups in the xarray such that we can efficiently iterate over them when collecting stats. In usual setups, we have one virtio-mem device per NUMA node, and usually only a small number of NUMA nodes. Note: for now, there seems to be no compelling reason to make this behavior configurable. Link: https://lkml.kernel.org/r/20210806124715.17090-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Anshuman Khandual Cc: Dan Williams Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Hui Zhu Cc: Jason Wang Cc: Len Brown Cc: Marek Kedzierski Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 6ffdc1db385f..cbcc43ad2b97 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -146,6 +146,9 @@ extern int memory_group_register_static(int nid, unsigned long max_pages); extern int memory_group_register_dynamic(int nid, unsigned long unit_pages); extern int memory_group_unregister(int mgid); struct memory_group *memory_group_find_by_id(int mgid); +typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); +int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, + struct memory_group *excluded, void *arg); #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From fe3df441ef885a75a3eff5e151ead1a92266d222 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:55:55 -0700 Subject: mm: remove redundant compound_head() calling There is a READ_ONCE() in the macro of compound_head(), which prevents the compiler from optimizing the code when it is called more than once in a function. Remove the redundant calls to compound_head() from page_to_index() and page_add_file_rmap() for better code generation. Link: https://lkml.kernel.org/r/20210811101431.83940-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Howells Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Kirill A.
Shutemov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..904e57db3a7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -521,18 +521,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping, */ static inline pgoff_t page_to_index(struct page *page) { - pgoff_t pgoff; + struct page *head; if (likely(!PageTransTail(page))) return page->index; + head = compound_head(page); /* * We don't initialize ->index for tail pages: calculate based on * head page */ - pgoff = compound_head(page)->index; - pgoff += page - compound_head(page); - return pgoff; + return head->index + page - head; } extern pgoff_t hugetlb_basepage_index(struct page *page); -- cgit v1.2.3 From 82a70ce0426dd7c4099516175019dccbd18cebf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 19:56:01 -0700 Subject: mm: move ioremap_page_range to vmalloc.c Patch series "small ioremap cleanups". The first patch moves a little code around the vmalloc/ioremap boundary following a bigger move by Nick earlier. The second enforces non-executable mapping on ioremap just like we do for vmap. No driver currently uses executable mappings anyway, which is as it should be. This patch (of 2): This keeps it together with the implementation, and allows removing the vmap_range wrapper. Link: https://lkml.kernel.org/r/20210824091259.1324527-1-hch@lst.de Link: https://lkml.kernel.org/r/20210824091259.1324527-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Nicholas Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2644425b6dce..671d402c3778 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -225,9 +225,6 @@ static inline bool is_vm_area_hugepages(const void *addr) } #ifdef CONFIG_MMU -int vmap_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int max_page_shift); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { -- cgit v1.2.3 From 513861202d1259e35934e206b79cd54f523d79b5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Sep 2021 19:56:09 -0700 Subject: highmem: don't disable preemption on RT in kmap_atomic() kmap_atomic() disables preemption and pagefaults for historical reasons. The conversion to kmap_local(), which only disables migration, cannot be done wholesale because quite a few call sites need to be updated to accommodate the changed semantics. On PREEMPT_RT enabled kernels the kmap_atomic() semantics are problematic due to the implicit disabling of preemption, which makes it impossible to acquire 'sleeping' spinlocks within the kmap atomic sections. PREEMPT_RT has replaced the preempt_disable() with a migrate_disable() for more than a decade. It could be argued that this is a justification to do this unconditionally, but PREEMPT_RT covers only a limited number of architectures and it disables some functionality which limits the coverage further. Limit the replacement to PREEMPT_RT for now.
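As a made-up illustration of the problem (not taken from the patch; zero_page_locked() and some_lock are hypothetical), the old kmap_atomic() semantics disable preemption across the section, so acquiring a spinlock_t, which is a sleeping rtmutex on PREEMPT_RT, is invalid there:

#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/string.h>

static DEFINE_SPINLOCK(some_lock);	/* hypothetical lock */

static void zero_page_locked(struct page *page)
{
	void *vaddr = kmap_atomic(page);	/* historically implies preempt_disable() */

	spin_lock(&some_lock);	/* sleeps on PREEMPT_RT: a bug with preemption disabled */
	memset(vaddr, 0, PAGE_SIZE);
	spin_unlock(&some_lock);
	kunmap_atomic(vaddr);
}

With migrate_disable() instead, the task stays on its CPU, which the CPU-local kmap bookkeeping requires, but remains preemptible, so taking the sleeping lock becomes legal.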
Link: https://lkml.kernel.org/r/20210810091116.pocdmaatdcogvdso@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Vlastimil Babka Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem-internal.h | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 7902c7d8b55f..4aa1031d3e4c 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -90,7 +90,11 @@ static inline void __kunmap_local(void *vaddr) static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_page_prot(page, prot); } @@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct page *page) static inline void *kmap_atomic_pfn(unsigned long pfn) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } @@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void *addr) { kunmap_local_indexed(addr); pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } unsigned int __nr_free_highpages(void); @@ -179,7 +190,10 @@ static inline void __kunmap_local(void *addr) static inline void *kmap_atomic(struct page *page) { - preempt_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -200,7 +214,10 @@ static inline void __kunmap_atomic(void *addr) kunmap_flush_on_unmap(addr); #endif pagefault_enable(); - preempt_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } -- cgit v1.2.3 From 41c961b9013ee9b6d0491f6926df546e37964b1f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Sep 2021 19:56:15 -0700 Subject: mm: introduce PAGEFLAGS_MASK to replace ((1UL << NR_PAGEFLAGS) - 1) Instead of hard-coding ((1UL << NR_PAGEFLAGS) - 1) everywhere, introduce PAGEFLAGS_MASK to make the code for getting the page flags clearer. Link: https://lkml.kernel.org/r/20210819150712.59948-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..6b8d66965145 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -178,6 +178,8 @@ enum pageflags { PG_reported = PG_uptodate, }; +#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) + #ifndef __GENERATING_BOUNDS_H static inline unsigned long _compound_head(const struct page *page) @@ -859,7 +861,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) * alloc-free cycle to prevent from reusing the page.
*/ #define PAGE_FLAGS_CHECK_AT_PREP \ - (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) + (PAGEFLAGS_MASK & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) -- cgit v1.2.3 From 2224d8485492e499ca2e5d25407f8502cc06f149 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:28 -0700 Subject: mm: introduce Data Access MONitor (DAMON) Patch series "Introduce Data Access MONitor (DAMON)", v34.
Introduction
============
DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON called 'region based sampling' and 'adaptive regions adjustment' (refer to 'mechanisms.rst' in the 11th patch of this patchset for the detail) make it - accurate (The monitored information is useful for DRAM level memory management. It might not be appropriate for cache-level accuracy, though.), - light-weight (The monitoring overhead is low enough to be applied online while making no impact on the performance of the target workloads.), and - scalable (the upper-bound of the instrumentation overhead is controllable regardless of the size of target workloads.). Using this framework, therefore, several memory management mechanisms such as reclamation and THP can be optimized to be aware of real data access patterns. Experimental access pattern aware memory management optimization works that incur high instrumentation overhead will be able to have another try. Though DAMON is for kernel subsystems, it can be easily exposed to the user space by writing a DAMON-wrapper kernel subsystem. Then, user space users who have some special workloads will be able to write personalized tools or applications for deeper understanding and specialized optimizations of their systems. DAMON is also merged in two public Amazon Linux kernel trees that are based on v5.4.y[1] and v5.10.y[2]. [1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon [2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon The userspace tool[1] is available, released under GPLv2, and actively being maintained. I am also planning to implement another basic user interface in perf[2]. Also, the basic test suite for DAMON is available under GPLv2[3]. [1] https://github.com/awslabs/damo [2] https://lore.kernel.org/linux-mm/20210107120729.22328-1-sjpark@amazon.com/ [3] https://github.com/awslabs/damon-tests
Long-term Plan
--------------
DAMON is a part of a project called Data Access-aware Operating System (DAOS). As the name implies, I want to improve the performance and efficiency of systems using fine-grained data access patterns. The optimizations are for both kernel and user spaces. I will therefore modify or create kernel subsystems, export some of those to user space and implement user space library / tools. Below shows the layers and components for the project.
---------------------------------------------------------------------------
Primitives:     PTE Accessed bit, PG_idle, rmap, (Intel CMT), ...
Framework:      DAMON
Features:       DAMOS, virtual addr, physical addr, ...
Applications:   DAMON-debugfs, (DARC), ...
^^^^^^^^^^^^^^^^^^^^^^^ KERNEL SPACE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Raw Interface:  debugfs, (sysfs), (damonfs), tracepoints, (sys_damon), ...
vvvvvvvvvvvvvvvvvvvvvvv USER SPACE vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
Library:        (libdamon), ...
Tools:          DAMO, (perf), ...
---------------------------------------------------------------------------
The components in parentheses or marked as '...' are not implemented yet but in the future plan.
IOW, those are the TODO tasks of the DAOS project. For more detail, please refer to the plans: https://lore.kernel.org/linux-mm/20201202082731.24828-1-sjpark@amazon.com/
Evaluations
===========
We evaluated DAMON's overhead, monitoring quality and usefulness using 24 realistic workloads on my QEMU/KVM based virtual machine running a kernel with the v24 DAMON patchset applied. DAMON is lightweight. It increases system memory usage by 0.39% and slows target workloads down by 1.16%. DAMON is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, namely 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of resident sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only for proof of concept. Please refer to the official document[1] or the "Documentation/admin-guide/mm: Add a document for DAMON" patch in this patchset for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/latest-damon/admin-guide/mm/damon/eval.html
Real-world User Story
=====================
In summary, DAMON has been used on production systems and proved its usefulness.
DAMON as a profiler
-------------------
We analyzed characteristics of large scale production systems of our customers using DAMON. The systems utilize 70GB DRAM and 36 CPUs. From this, we were able to find the interesting things below. There were obviously different access patterns under the idle workload and the active workload. Under the idle workload, it accessed large memory regions with low frequency, while the active workload accessed small memory regions with high frequency. DAMON found a 7GB memory region showing obviously high access frequency under the active workload. We believe this is the performance-effective working set and needs to be protected. There was a 4KB memory region showing the highest access frequency under not only active but also idle workloads. We think this must be a hot code section or something similar that should never be paged out. For this analysis, DAMON used only 0.3-1% of a single CPU's time. Because we used recording-based analysis, it consumed about 3-12 MB of disk space per 20 minutes. This is only a small amount of disk space, but we can further reduce the disk usage by using non-recording-based DAMON features. I'd like to argue that only DAMON can do such detailed analysis (finding the hottest 4KB region in 70GB of memory) with such light overhead.
DAMON as a system optimization tool
-----------------------------------
We also found the potential performance problems below on the systems and made DAMON-based solutions. The system doesn't want to make the workload suffer from the page reclamation and thus it utilizes enough DRAM but no swap device. However, we found the system is actively reclaiming file-backed pages, because the system has intensive file IO. The file IO turned out to be not performance critical for the workload, but the customer wanted to ensure performance critical file-backed pages like code sections are not mistakenly evicted. Using direct IO or `mlock()` would be a straightforward solution, but modifying the user space code is not easy for the customer. Alternatively, we could use a DAMON-based operation scheme[1].
By using it, we can ask DAMON to track the access frequency of each region and make a 'process_madvise(MADV_WILLNEED)'[2] call for regions having a specific size and access frequency over a time interval. We also found the system has a high number of TLB misses. We tried the 'always' THP enabled policy and it greatly reduced TLB misses, but page reclamation also became more frequent due to the memory bloat caused by THP internal fragmentation. We could try another DAMON-based operation scheme that applies 'MADV_HUGEPAGE' to memory regions having >=2MB size and high access frequency, while applying 'MADV_NOHUGEPAGE' to regions having <2MB size and low access frequency. We do not own the systems so we only reported the analysis results and possible optimization solutions to the customers. The customers were satisfied with the analysis results and promised to try the optimization guides. [1] https://lore.kernel.org/linux-mm/20201006123931.5847-1-sjpark@amazon.com/ [2] https://lore.kernel.org/linux-api/20200622192900.22757-4-minchan@kernel.org/
Comparison with Idle Page Tracking
==================================
Idle Page Tracking allows users to set and read idleness of pages using a bitmap file which represents each page with each bit of the file. One recommended usage of it is working set size detection. Users can do that by 1. finding the PFN of each page for workloads in interest, 2. setting all the pages as idle by doing writes to the bitmap file, 3. waiting until the workload accesses its working set, and 4. reading the idleness of the pages again and counting the pages that became not idle. NOTE: While Idle Page Tracking is for user space users, DAMON is primarily designed for kernel subsystems though it can easily be exposed to the user space. Hence, this section only assumes such user space use of DAMON.
For what use cases Idle Page Tracking would be better?
------------------------------------------------------
1. Flexible use cases other than hotness monitoring. Because Idle Page Tracking allows users to control the primitive (Page idleness) by themselves, Idle Page Tracking users can do anything they want. Meanwhile, DAMON is primarily designed to monitor the hotness of each memory region. For this, DAMON asks users to provide sampling interval and aggregation interval. For that reason, there could be some use cases where using Idle Page Tracking is simpler. 2. Physical memory monitoring. Idle Page Tracking receives a PFN range as input, so it natively supports physical memory monitoring. DAMON is designed to be extensible for multiple address spaces and use cases by implementing and using primitives for the given use case. Therefore, in theory, DAMON has no limitation in the type of target address space as long as primitives for the given address space exist. However, the default primitives introduced by this patchset support only virtual address spaces. Therefore, for physical memory monitoring, you should implement your own primitives and use them, or simply use Idle Page Tracking. Nonetheless, RFC patchsets[1] for the physical memory address space primitives are already available. It also supports user memory, the same as Idle Page Tracking. [1] https://lore.kernel.org/linux-mm/20200831104730.28970-1-sjpark@amazon.com/
For what use cases DAMON is better?
-----------------------------------
1. Hotness Monitoring. Idle Page Tracking lets users know only if a page frame is accessed or not. For hotness checks, the user should write more code and use more memory. DAMON does that by itself. 2.
Low Monitoring Overhead DAMON receives a user's monitoring request in one step and then provides the results. So, roughly speaking, DAMON requires only O(1) user/kernel context switches. In case of Idle Page Tracking, however, because the interface receives contiguous page frames, the number of user/kernel context switches increases as the monitoring target becomes complex and huge. As a result, the context switch overhead could be non-negligible. Moreover, DAMON is built to handle the monitoring overhead. Because the core mechanism is purely logical, Idle Page Tracking users might be able to implement the mechanism on their own, but it would be time consuming and the user/kernel context switching will still be more frequent than that of DAMON. Also, the kernel subsystems cannot use the logic in this case. 3. Page granularity working set size detection. Until v22 of this patchset, this was categorized as the thing Idle Page Tracking could do better, because DAMON basically maintains additional metadata for each of the monitoring target regions. So, in the page granularity working set size detection use case, DAMON would incur (number of monitoring target pages * size of metadata) memory overhead. The size of a single metadata item is about 54 bytes, so assuming 4KB pages, about 1.3% of monitoring target pages will be additionally used. All essential metadata for Idle Page Tracking are embedded in 'struct page' and page table entries. Therefore, in this use case, only one counter variable for working set size accounting is required if Idle Page Tracking is used. There are more details to consider, but roughly speaking, this is true in most cases. However, the situation changed from v23. Now DAMON supports arbitrary types of monitoring targets, which don't use the metadata. Using that, DAMON can do the working set size detection with no additional space overhead but fewer user/kernel context switches. A first draft for the implementation of monitoring primitives for this usage is available in a DAMON development tree[1]. An RFC patchset for it based on this patchset will also be available soon. Since v24, the arbitrary type support is dropped from this patchset because this patchset doesn't introduce real use of the type. You can still get it from the DAMON development tree[2], though. [1] https://github.com/sjp38/linux/tree/damon/pgidle_hack [2] https://github.com/sjp38/linux/tree/damon/master 4. More future use cases While Idle Page Tracking has tight coupling with base primitives (PG_Idle and page table Accessed bits), DAMON is designed to be extensible for many use cases and address spaces. If you need some special address type or want to use special h/w access check primitives, you can write your own primitives for that and configure DAMON to use those. Therefore, if your use case could change a lot in the future, using DAMON could be better.
Can I use both Idle Page Tracking and DAMON?
--------------------------------------------
Yes, though using them concurrently for overlapping memory regions could result in interference with each other. Nevertheless, such a use case would be rare or make no sense at all. Even in that case, the noise would not be really significant. So, you can choose whatever you want depending on the characteristics of your use cases.
More Information
================
We prepared a showcase web site[1] where you can get more information.
There are

- the official documentation[2],
- the heatmap-format dynamic access patterns of various realistic workloads for the heap area[3], the mmap()-ed area[4], and the stack area[5],
- the dynamic working set size distribution[6] and chronological working set size changes[7], and
- the latest performance test results[8].

[1] https://damonitor.github.io/_index
[2] https://damonitor.github.io/doc/html/latest-damon
[3] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.0.png.html
[4] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
[5] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.2.png.html
[6] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
[7] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
[8] https://damonitor.github.io/test/result/perf/latest/html/index.html

Baseline and Complete Git Trees
===============================

The patches are based on the latest -mm tree, specifically v5.14-rc1-mmots-2021-07-15-18-47 of https://github.com/hnaz/linux-mm. You can also clone the complete git tree:

    $ git clone git://github.com/sjp38/linux -b damon/patches/v34

A web view is also available:
https://github.com/sjp38/linux/releases/tag/damon/patches/v34

Development Trees
-----------------

There are a couple of trees for the entire DAMON patchset series and features for future release.

- For the latest release: https://github.com/sjp38/linux/tree/damon/master
- For the next release: https://github.com/sjp38/linux/tree/damon/next

Long-term Support Trees
-----------------------

For people who want to test DAMON but are using LTS kernels, there is another couple of trees, based on the two latest LTS kernels respectively and containing the 'damon/master' backports.

- For v5.4.y: https://github.com/sjp38/linux/tree/damon/for-v5.4.y
- For v5.10.y: https://github.com/sjp38/linux/tree/damon/for-v5.10.y

Amazon Linux Kernel Trees
-------------------------

DAMON is also merged in two public Amazon Linux kernel trees that are based on v5.4.y[1] and v5.10.y[2].

[1] https://github.com/amazonlinux/linux/tree/amazon-5.4.y/master/mm/damon
[2] https://github.com/amazonlinux/linux/tree/amazon-5.10.y/master/mm/damon

Git Tree for Diff of Patches
============================

For easy review of the diff between different versions of each patch, I prepared a git tree containing all versions of the DAMON patchset series:

https://github.com/sjp38/damon-patches

You can clone it and use 'diff' for easy review of changes between different versions of the patchset. For example:

    $ git clone https://github.com/sjp38/damon-patches && cd damon-patches
    $ diff -u damon/v33 damon/v34

Sequence Of Patches
===================

The first three patches implement the core logic of DAMON. The 1st patch introduces basic sampling-based hotness monitoring for arbitrary types of targets. The following two patches implement the core mechanisms for the control of overhead and accuracy, namely regions-based sampling (patch 2) and adaptive regions adjustment (patch 3).

Now the essential parts of DAMON are complete, but it cannot work unless someone provides monitoring primitives for a specific use case. The following two patches make it just work for virtual address space monitoring. The 4th patch makes 'PG_idle' usable by DAMON, and the 5th patch implements the virtual memory address space specific monitoring primitives using page table Accessed bits and the 'PG_idle' page flag.

Now DAMON just works for virtual address space monitoring via the kernel space API.
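For illustration, a kernel space user of the completed series could drive the monitoring roughly as below (a minimal sketch using only functions that the patches in this series declare; error handling is omitted and the result-reading callback wiring is not shown)::

    #include <linux/damon.h>

    static struct damon_ctx *ctx;

    static int example_start_monitoring(unsigned long pid)
    {
        ctx = damon_new_ctx();
        if (!ctx)
            return -ENOMEM;
        /* use the reference virtual address space primitives */
        damon_va_set_primitives(ctx);
        /* 5ms sampling, 100ms aggregation, 1s update, 10-1000 regions */
        damon_set_attrs(ctx, 5000, 100000, 1000000, 10, 1000);
        /* one target process, identified by its pid */
        damon_set_targets(ctx, &pid, 1);
        return damon_start(&ctx, 1);
    }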
To let user space use DAMON, the following four patches add interfaces for it. The 6th patch adds a tracepoint for monitoring results. The 7th patch implements a DAMON application kernel module, namely damon-dbgfs, that simply wraps DAMON and exposes the DAMON interface to user space via the debugfs interface. The 8th patch further exports the pid of the monitoring thread (kdamond) to user space for easier CPU usage accounting, and the 9th patch makes the debugfs interface support multiple contexts.

Three patches for maintainability follow. The 10th patch adds documentation for both the user space and the kernel space. The 11th patch provides unit tests (based on kunit), while the 12th patch adds user space tests (based on kselftest). Finally, the last patch (13th) updates the MAINTAINERS file.

This patch (of 13):

DAMON is a data access monitoring framework for the Linux kernel. The core mechanisms of DAMON make it

- accurate (the monitoring output is useful enough for DRAM level performance-centric memory management; it might be inappropriate for CPU cache levels, though),
- light-weight (the monitoring overhead is normally low enough to be applied online), and
- scalable (the upper-bound of the overhead is in a constant range regardless of the size of target workloads).

Using this framework, hence, we can easily write efficient kernel space data access monitoring applications. For example, the kernel's memory management mechanisms can make advanced decisions using this. Experimental data access aware optimization works that previously incurred high access monitoring overhead could be implemented on top of this again.

Due to its simple and flexible interface, providing a user space interface would also be easy. Then, user space users who have some special workloads can write personalized applications for better understanding and optimization of their workloads and systems.

===

Nevertheless, this commit is defining and implementing only the basic access check part, without the overhead-accuracy handling core logic. The basic access check works as below.

The output of DAMON says which memory regions are how frequently accessed for a given duration. The resolution of the access frequency is controlled by setting the ``sampling interval`` and the ``aggregation interval``. In detail, DAMON checks access to each page per ``sampling interval`` and aggregates the results. In other words, it counts the number of accesses to each region. After each ``aggregation interval`` passes, DAMON calls callback functions that users previously registered, so that users can read the aggregated results, and then clears the results. This can be described by the simple pseudo-code below::

    init()
    while monitoring_on:
        for page in monitoring_target:
            if accessed(page):
                nr_accesses[page] += 1
        if time() % aggregation_interval == 0:
            for callback in user_registered_callbacks:
                callback(monitoring_target, nr_accesses)
            for page in monitoring_target:
                nr_accesses[page] = 0
        if time() % update_interval == 0:
            update()
        sleep(sampling interval)

The target regions are constructed at the beginning of the monitoring and updated after each ``regions_update_interval``, because the target regions could dynamically change (e.g., by mmap() or memory hotplug). The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows.

The basic monitoring primitives for the actual access check and the dynamic target regions construction aren't in the core part of DAMON.
Instead, it allows users to implement their own primitives that are optimized for their use case and configure DAMON to use those. In other words, users cannot use the current version of DAMON without some additional work. The following commits will implement the core mechanisms for the overhead-accuracy control and default primitives implementations. Link: https://lkml.kernel.org/r/20210716081449.22187-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-2-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Jonathan Cameron Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Joe Perches Cc: Mel Gorman Cc: Maximilian Heyne Cc: Minchan Kim Cc: Ingo Molnar Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: David Rientjes Cc: Steven Rostedt (VMware) Cc: Shuah Khan Cc: Vlastimil Babka Cc: Vladimir Davydov Cc: Brendan Higgins Cc: Markus Boehme Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 include/linux/damon.h (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h new file mode 100644 index 000000000000..2f652602b1ea --- /dev/null +++ b/include/linux/damon.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON api + * + * Author: SeongJae Park + */ + +#ifndef _DAMON_H_ +#define _DAMON_H_ + +#include +#include +#include + +struct damon_ctx; + +/** + * struct damon_primitive Monitoring primitives for given use cases. + * + * @init: Initialize primitive-internal data structures. + * @update: Update primitive-internal data structures. + * @prepare_access_checks: Prepare next access check of target regions. + * @check_accesses: Check the accesses to target regions. + * @reset_aggregated: Reset aggregated accesses monitoring results. + * @target_valid: Determine if the target is valid. + * @cleanup: Clean up the context. + * + * DAMON can be extended for various address spaces and usages. For this, + * users should register the low level primitives for their target address + * space and usecase via the &damon_ctx.primitive. Then, the monitoring thread + * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting + * the monitoring, @update after each &damon_ctx.primitive_update_interval, and + * @check_accesses, @target_valid and @prepare_access_checks after each + * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each + * &damon_ctx.aggr_interval. + * + * @init should initialize primitive-internal data structures. For example, + * this could be used to construct proper monitoring target regions and link + * those to @damon_ctx.target. + * @update should update the primitive-internal data structures. For example, + * this could be used to update monitoring target regions for current status. + * @prepare_access_checks should manipulate the monitoring regions to be + * prepared for the next access check. + * @check_accesses should check the accesses to each region that made after the + * last preparation and update the number of observed accesses of each region. + * @reset_aggregated should reset the access monitoring results that aggregated + * by @check_accesses.
+ * @target_valid should check whether the target is still valid for the + * monitoring. + * @cleanup is called from @kdamond just before its termination. + */ +struct damon_primitive { + void (*init)(struct damon_ctx *context); + void (*update)(struct damon_ctx *context); + void (*prepare_access_checks)(struct damon_ctx *context); + void (*check_accesses)(struct damon_ctx *context); + void (*reset_aggregated)(struct damon_ctx *context); + bool (*target_valid)(void *target); + void (*cleanup)(struct damon_ctx *context); +}; + +/* + * struct damon_callback Monitoring events notification callbacks. + * + * @before_start: Called before starting the monitoring. + * @after_sampling: Called after each sampling. + * @after_aggregation: Called after each aggregation. + * @before_terminate: Called before terminating the monitoring. + * @private: User private data. + * + * The monitoring thread (&damon_ctx.kdamond) calls @before_start and + * @before_terminate just before starting and finishing the monitoring, + * respectively. Therefore, those are good places for installing and cleaning + * @private. + * + * The monitoring thread calls @after_sampling and @after_aggregation for each + * of the sampling intervals and aggregation intervals, respectively. + * Therefore, users can safely access the monitoring results without additional + * protection. For the reason, users are recommended to use these callback for + * the accesses to the results. + * + * If any callback returns non-zero, monitoring stops. + */ +struct damon_callback { + void *private; + + int (*before_start)(struct damon_ctx *context); + int (*after_sampling)(struct damon_ctx *context); + int (*after_aggregation)(struct damon_ctx *context); + int (*before_terminate)(struct damon_ctx *context); +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. + * + * @sample_interval: The time between access samplings. + * @aggr_interval: The time between monitor results aggregations. + * @primitive_update_interval: The time between monitoring primitive updates. + * + * For each @sample_interval, DAMON checks whether each region is accessed or + * not. It aggregates and keeps the access information (number of accesses to + * each region) for @aggr_interval time. DAMON also checks whether the target + * memory regions need update (e.g., by ``mmap()`` calls from the application, + * in case of virtual memory monitoring) and applies the changes for each + * @primitive_update_interval. All time intervals are in micro-seconds. + * Please refer to &struct damon_primitive and &struct damon_callback for more + * detail. + * + * @kdamond: Kernel thread who does the monitoring. + * @kdamond_stop: Notifies whether kdamond should stop. + * @kdamond_lock: Mutex for the synchronizations with @kdamond. + * + * For each monitoring context, one kernel thread for the monitoring is + * created. The pointer to the thread is stored in @kdamond. + * + * Once started, the monitoring thread runs until explicitly required to be + * terminated or every monitoring target is invalid. The validity of the + * targets is checked via the &damon_primitive.target_valid of @primitive. The + * termination can also be explicitly requested by writing non-zero to + * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. + * Therefore, users can know whether the monitoring is ongoing or terminated by + * reading @kdamond. 
Reads and writes to @kdamond and @kdamond_stop from + outside of the monitoring thread must be protected by @kdamond_lock. + * + * Note that the monitoring thread protects only @kdamond and @kdamond_stop via + * @kdamond_lock. Accesses to other fields must be protected by themselves. + * + * @primitive: Set of monitoring primitives for given use cases. + * @callback: Set of callbacks for monitoring events notifications. + * + * @target: Pointer to the user-defined monitoring target. + */ +struct damon_ctx { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long primitive_update_interval; + +/* private: internal use only */ + struct timespec64 last_aggregation; + struct timespec64 last_primitive_update; + +/* public: */ + struct task_struct *kdamond; + bool kdamond_stop; + struct mutex kdamond_lock; + + struct damon_primitive primitive; + struct damon_callback callback; + + void *target; +}; + +#ifdef CONFIG_DAMON + +struct damon_ctx *damon_new_ctx(void); +void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, + unsigned long aggr_int, unsigned long primitive_upd_int); + +int damon_start(struct damon_ctx **ctxs, int nr_ctxs); +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); + +#endif /* CONFIG_DAMON */ + +#endif /* _DAMON_H */ -- cgit v1.2.3 From f23b8eee1871a6db5c37f90831147de5426c40b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:32 -0700 Subject: mm/damon/core: implement region-based sampling To avoid the unbounded increase of the overhead, DAMON groups adjacent pages that are assumed to have the same access frequencies into a region. As long as the assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, 1. the 'prepare_access_checks' primitive picks one page in each region, 2. DAMON waits for one ``sampling interval``, 3. checks whether the page is accessed meanwhile, and 4. increases the access count of the region if so. Therefore, the monitoring overhead is controllable by adjusting the number of regions. DAMON allows both the underlying primitives and user callbacks to adjust regions for the trade-off. In other words, this commit makes DAMON use not only time-based sampling but also space-based sampling. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. The next commit will address this problem.
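In the pseudo-code style of the cover letter, the work per ``sampling interval`` becomes roughly the following (an illustration only; the field names are those of the damon_region structure added by this patch)::

    for t in targets:
        for r in t.regions:
            # one page represents the whole region
            r.sampling_addr = a random page in [r.ar.start, r.ar.end)
            clear the Accessed state of r.sampling_addr
    sleep(sampling interval)
    for t in targets:
        for r in t.regions:
            if r.sampling_addr was accessed meanwhile:
                r.nr_accesses += 1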
Link: https://lkml.kernel.org/r/20210716081449.22187-3-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2f652602b1ea..67db309ad61b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,48 @@ #include #include +/** + * struct damon_addr_range - Represents an address region of [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_addr_range { + unsigned long start; + unsigned long end; +}; + +/** + * struct damon_region - Represents a monitoring target region. + * @ar: The address range of the region. + * @sampling_addr: Address of the sample for the next access check. + * @nr_accesses: Access frequency of this region. + * @list: List head for siblings. + */ +struct damon_region { + struct damon_addr_range ar; + unsigned long sampling_addr; + unsigned int nr_accesses; + struct list_head list; +}; + +/** + * struct damon_target - Represents a monitoring target. + * @id: Unique identifier for this target. + * @regions_list: Head of the monitoring target regions of this target. + * @list: List head for siblings. + * + * Each monitoring context could have multiple targets. For example, a context + * for virtual memory address spaces could have multiple target processes. The + * @id of each target should be unique among the targets of the context. For + * example, in the virtual address monitoring context, it could be a pidfd or + * an address of an mm_struct. + */ +struct damon_target { + unsigned long id; + struct list_head regions_list; + struct list_head list; +}; + struct damon_ctx; /** @@ -36,7 +78,7 @@ struct damon_ctx; * * @init should initialize primitive-internal data structures. For example, * this could be used to construct proper monitoring target regions and link - * those to @damon_ctx.target. + * those to @damon_ctx.adaptive_targets. * @update should update the primitive-internal data structures. For example, * this could be used to update monitoring target regions for current status. * @prepare_access_checks should manipulate the monitoring regions to be @@ -130,7 +172,7 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @target: Pointer to the user-defined monitoring target. + * @region_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -149,11 +191,40 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - void *target; + struct list_head region_targets; }; +#define damon_next_region(r) \ + (container_of(r->list.next, struct damon_region, list)) + +#define damon_prev_region(r) \ + (container_of(r->list.prev, struct damon_region, list)) + +#define damon_for_each_region(r, t) \ + list_for_each_entry(r, &t->regions_list, list) + +#define damon_for_each_region_safe(r, next, t) \ + list_for_each_entry_safe(r, next, &t->regions_list, list) + +#define damon_for_each_target(t, ctx) \ + list_for_each_entry(t, &(ctx)->region_targets, list) + +#define damon_for_each_target_safe(t, next, ctx) \ + list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + #ifdef CONFIG_DAMON +struct damon_region *damon_new_region(unsigned long start, unsigned long end); +inline void damon_insert_region(struct damon_region *r, + struct damon_region *prev, struct damon_region *next); +void damon_add_region(struct damon_region *r, struct damon_target *t); +void damon_destroy_region(struct damon_region *r); + +struct damon_target *damon_new_target(unsigned long id); +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +void damon_free_target(struct damon_target *t); +void damon_destroy_target(struct damon_target *t); + struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, -- cgit v1.2.3 From b9a6ac4e4ede4172d165c133398b93e3233b0ba7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:36 -0700 Subject: mm/damon: adaptively adjust regions Even if the initial monitoring target regions are somehow well constructed to fulfill the assumption (pages in the same region have similar access frequencies), the data access pattern can change dynamically. This will result in low monitoring quality. To keep the assumption as much as possible, DAMON adaptively merges and splits each region based on their access frequency. For each ``aggregation interval``, it compares the access frequencies of adjacent regions and merges those if the frequency difference is small. Then, after it reports and clears the aggregated access frequency of each region, it splits each region into two or three regions if the total number of regions will not exceed the user-specified maximum number of regions after the split. In this way, DAMON provides its best-effort quality and minimal overhead while keeping the upper-bound overhead that users set.
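In the same pseudo-code style, each ``aggregation interval`` therefore roughly does the following (an illustration only; the exact merge threshold and split heuristics are left to the implementation)::

    # merge adjacent regions that show similar access frequencies
    for each adjacent pair (r, s) of regions in each target:
        if abs(r.nr_accesses - s.nr_accesses) <= merge_threshold:
            merge r and s into a single region
    # report the aggregated results, then reset the counters
    for callback in user_registered_callbacks:
        callback(targets)
    for each region r:
        r.nr_accesses = 0
    # split again to regain resolution, but respect the user limit
    if splitting keeps the total region count <= max_nr_regions:
        split each region into two or three smaller regions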
Link: https://lkml.kernel.org/r/20210716081449.22187-4-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 67db309ad61b..ce2a84b26cd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -12,6 +12,9 @@ #include #include +/* Minimal region size. Every damon_region is aligned by this. */ +#define DAMON_MIN_REGION PAGE_SIZE + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -39,6 +42,7 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. * @id: Unique identifier for this target. + * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * @@ -50,6 +54,7 @@ struct damon_region { */ struct damon_target { unsigned long id; + unsigned int nr_regions; struct list_head regions_list; struct list_head list; }; @@ -85,6 +90,8 @@ struct damon_ctx; * prepared for the next access check. * @check_accesses should check the accesses to each region that made after the * last preparation and update the number of observed accesses of each region. + * It should also return max number of observed accesses that made as a result + * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. * @target_valid should check whether the target is still valid for the @@ -95,7 +102,7 @@ struct damon_primitive { void (*init)(struct damon_ctx *context); void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); - void (*check_accesses)(struct damon_ctx *context); + unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); @@ -172,7 +179,9 @@ struct damon_callback { * @primitive: Set of monitoring primitives for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @region_targets: Head of monitoring targets (&damon_target) list. + * @min_nr_regions: The minimum number of adaptive monitoring regions. + * @max_nr_regions: The maximum number of adaptive monitoring regions. + * @adaptive_targets: Head of monitoring targets (&damon_target) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -191,7 +200,9 @@ struct damon_ctx { struct damon_primitive primitive; struct damon_callback callback; - struct list_head region_targets; + unsigned long min_nr_regions; + unsigned long max_nr_regions; + struct list_head adaptive_targets; }; #define damon_next_region(r) \ @@ -207,28 +218,31 @@ struct damon_ctx { list_for_each_entry_safe(r, next, &t->regions_list, list) #define damon_for_each_target(t, ctx) \ - list_for_each_entry(t, &(ctx)->region_targets, list) + list_for_each_entry(t, &(ctx)->adaptive_targets, list) #define damon_for_each_target_safe(t, next, ctx) \ - list_for_each_entry_safe(t, next, &(ctx)->region_targets, list) + list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next); + struct damon_region *prev, struct damon_region *next, + struct damon_target *t); void damon_add_region(struct damon_region *r, struct damon_target *t); -void damon_destroy_region(struct damon_region *r); +void damon_destroy_region(struct damon_region *r, struct damon_target *t); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); +unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long primitive_upd_int); + unsigned long aggr_int, unsigned long primitive_upd_int, + unsigned long min_nr_reg, unsigned long max_nr_reg); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From 1c676e0d9b1a59b98885b24a0e16a81fe4cc8301 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:40 -0700 Subject: mm/idle_page_tracking: make PG_idle reusable PG_idle and PG_young allow the two PTE Accessed bit users, Idle Page Tracking and the reclaim logic, to work concurrently without interfering with each other. That is, when one of them needs to clear the Accessed bit, it sets PG_young to preserve the previous state of the bit. And when one needs to read the bit and finds it cleared, it additionally reads PG_young to know whether the other user cleared the bit in the meantime. For yet another user of the PTE Accessed bit, we could add another page flag, or extend the mechanism to use the flags. For the DAMON usecase, however, we don't need to do that just yet. IDLE_PAGE_TRACKING and DAMON are mutually exclusive, so there's only ever going to be one user of the current set of flags. In this commit, we split out the CONFIG options to allow for the use of PG_young and PG_idle outside of idle page tracking. In the next commit, DAMON's reference implementation of the virtual memory address space monitoring primitives will use it.
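The sharing protocol can be sketched as below (a simplified illustration built on the page_idle.h helpers; this is not the exact DAMON code)::

    #include <linux/page_idle.h>
    #include <linux/pgtable.h>

    /* before sampling: clear the Accessed bit, preserving its state */
    static void sample_mkold(struct vm_area_struct *vma, unsigned long addr,
                             pte_t *pte, struct page *page)
    {
        if (ptep_test_and_clear_young(vma, addr, pte))
            set_page_young(page);  /* keep the info for the other user */
        set_page_idle(page);
    }

    /* after sampling: consider both the h/w bit and PG_young */
    static bool sample_young(pte_t *pte, struct page *page)
    {
        return pte_young(*pte) || !page_is_idle(page) ||
               test_and_clear_page_young(page);
    }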
[sjpark@amazon.de: set PAGE_EXTENSION for non-64BIT] Link: https://lkml.kernel.org/r/20210806095153.6444-1-sj38.park@gmail.com [akpm@linux-foundation.org: tweak Kconfig text] [sjpark@amazon.de: hide PAGE_IDLE_FLAG from users] Link: https://lkml.kernel.org/r/20210813081238.34705-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-5-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- include/linux/page_ext.h | 2 +- include/linux/page_idle.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b8d66965145..0a51dd1bb6b1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -131,7 +131,7 @@ enum pageflags { #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif @@ -441,7 +441,7 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index aff81ba31bd8..fabb2e1e087f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,7 +19,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, -#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) +#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 1e894d34bdce..d8a6aecf99cb 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -6,7 +6,7 @@ #include #include -#ifdef CONFIG_IDLE_PAGE_TRACKING +#ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT static inline bool page_is_young(struct page *page) @@ -106,7 +106,7 @@ static inline void clear_page_idle(struct page *page) } #endif /* CONFIG_64BIT */ -#else /* !CONFIG_IDLE_PAGE_TRACKING */ +#else /* !CONFIG_PAGE_IDLE_FLAG */ static inline bool page_is_young(struct page *page) { @@ -135,6 +135,6 @@ static inline void clear_page_idle(struct page *page) { } -#endif /* CONFIG_IDLE_PAGE_TRACKING */ +#endif /* CONFIG_PAGE_IDLE_FLAG */ #endif /* _LINUX_MM_PAGE_IDLE_H */ -- cgit v1.2.3 From 3f49584b262cf8f42b25f4c1ad9f5bfd3bdc1bca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:44 -0700 Subject: mm/damon: implement primitives for the virtual memory address spaces This commit introduces a reference implementation of the address space specific low level primitives for the virtual address space, so that users of 
DAMON can easily monitor the data accesses on virtual address spaces of specific processes by simply configuring the implementation to be used by DAMON.

The low level primitives for the fundamental access monitoring are defined in two parts: 1. identification of the monitoring target address range for the address space, and 2. access check of a specific address range in the target space.

The reference implementation for the virtual address space works as described below.

PTE Accessed-bit Based Access Check
-----------------------------------

The implementation uses the PTE Accessed-bit for basic access checks. That is, it clears the bit for the next sampling target page and checks whether it is set again after one sampling period. This could disturb the reclaim logic. DAMON uses the ``PG_idle`` and ``PG_young`` page flags to solve the conflict, as Idle Page Tracking does.

VMA-based Target Address Range Construction
-------------------------------------------

Only small parts of the super-huge virtual address space of a process are mapped to physical memory and accessed. Thus, tracking the unmapped address regions is just wasteful. However, because DAMON can deal with some level of noise using the adaptive regions adjustment mechanism, tracking every mapping is not strictly required; it could even incur a high overhead in some cases. That said, excessively huge unmapped areas inside the monitoring target should be removed so that the adaptive mechanism does not waste its effort on them.

For this reason, this implementation converts the complex mappings into three distinct regions that cover every mapped area of the address space. The two gaps between the three regions are the two biggest unmapped areas in the given address space. In most cases, the two biggest unmapped areas are the gap between the heap and the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed region and the stack. Because these gaps are exceptionally huge in usual address spaces, excluding them is sufficient to make a reasonable trade-off.
Below shows this in detail::

    <heap>
    <BIG UNMAPPED REGION 1>
    <uppermost mmap()-ed region>
    (small mmap()-ed regions and munmap()-ed regions)
    <lowermost mmap()-ed region>
    <BIG UNMAPPED REGION 2>
    <stack>

[akpm@linux-foundation.org: mm/damon/vaddr.c needs highmem.h for kunmap_atomic()] [sjpark@amazon.de: remove unnecessary PAGE_EXTENSION setup] Link: https://lkml.kernel.org/r/20210806095153.6444-2-sj38.park@gmail.com [sjpark@amazon.de: safely walk page table] Link: https://lkml.kernel.org/r/20210831161800.29419-1-sj38.park@gmail.com Link: https://lkml.kernel.org/r/20210716081449.22187-6-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Acked-by: Shakeel Butt Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ce2a84b26cd7..edb350e52b93 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -249,4 +249,17 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ +#ifdef CONFIG_DAMON_VADDR + +/* Monitoring primitives for virtual memory address spaces */ +void damon_va_init(struct damon_ctx *ctx); +void damon_va_update(struct damon_ctx *ctx); +void damon_va_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_va_check_accesses(struct damon_ctx *ctx); +bool damon_va_target_valid(void *t); +void damon_va_cleanup(struct damon_ctx *ctx); +void damon_va_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_VADDR */ + #endif /* _DAMON_H */ -- cgit v1.2.3 From 4bc05954d0076655cfaf6f0135585bdc20cd6b11 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 7 Sep 2021 19:56:53 -0700 Subject: mm/damon: implement a debugfs-based user space interface

DAMON is designed to be used by kernel space code such as the memory management subsystems, and therefore it provides only a kernel space API. That said, letting user space control DAMON could provide some benefits. For example, it would allow user space to analyze their specific workloads and make their own special optimizations.

For such cases, this commit implements a simple DAMON application kernel module, namely 'damon-dbgfs', which merely wraps the DAMON API and exports it to user space via debugfs. 'damon-dbgfs' exports three files, ``attrs``, ``target_ids``, and ``monitor_on``, under its debugfs directory, ``<debugfs>/damon/``.

Attributes
----------

Users can read and write the ``sampling interval``, ``aggregation interval``, ``regions update interval``, and the min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. For example, the commands below set those values to 5 ms, 100 ms, 1,000 ms, 10, and 1000, then check them again::

    # cd <debugfs>/damon
    # echo 5000 100000 1000000 10 1000 > attrs
    # cat attrs
    5000 100000 1000000 10 1000

Target IDs
----------

Some types of address spaces support multiple monitoring targets. For example, virtual memory address space monitoring can have multiple processes as the monitoring targets.
Users can set the targets by writing the relevant id values of the targets to, and get the ids of the current targets by reading from, the ``target_ids`` file. In case of virtual address space monitoring, the values should be the pids of the monitoring target processes. For example, the commands below set the processes having pids 42 and 4242 as the monitoring targets, then check them again::

    # cd <debugfs>/damon
    # echo 42 4242 > target_ids
    # cat target_ids
    42 4242

Note that setting the target ids doesn't start the monitoring.

Turning On/Off
--------------

Setting the files as described above has no effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the monitoring by writing to and reading from the ``monitor_on`` file. Writing ``on`` to the file starts the monitoring of the targets with the attributes. Writing ``off`` to the file stops the monitoring. DAMON also stops if every target is invalidated (in case of virtual memory monitoring, target processes are invalidated when terminated). The example commands below turn the monitoring on and off, then check its status::

    # cd <debugfs>/damon
    # echo on > monitor_on
    # echo off > monitor_on
    # cat monitor_on
    off

Please note that you cannot write to the above-mentioned debugfs files while the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned.

[akpm@linux-foundation.org: remove unneeded "alloc failed" printks] [akpm@linux-foundation.org: replace macro with static inline] Link: https://lkml.kernel.org/r/20210716081449.22187-8-sj38.park@gmail.com Signed-off-by: SeongJae Park Reviewed-by: Leonard Foerster Reviewed-by: Fernand Sieber Cc: Alexander Shishkin Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Fan Du Cc: Greg Kroah-Hartman Cc: Greg Thelen Cc: Ingo Molnar Cc: Joe Perches Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Marco Elver Cc: Markus Boehme Cc: Maximilian Heyne Cc: Mel Gorman Cc: Minchan Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shakeel Butt Cc: Shuah Khan Cc: Steven Rostedt (VMware) Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index edb350e52b93..d68b67b8d458 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -240,9 +240,12 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); +int damon_set_targets(struct damon_ctx *ctx, + unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From a8a47cf5ce4bbc70a54fa4eca71d35f43dc8218a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Sep 2021 19:57:41 -0700 Subject: include/linux/once.h: fix trivia typo Not -> Note Fix the trivial typo Not -> Note in the comment to DO_ONCE().
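For context, DO_ONCE() runs its callee at most once per call site, which makes it handy for lazy one-time initialization, e.g. (an illustrative sketch)::

    #include <linux/once.h>
    #include <linux/random.h>

    static u32 hash_seed;

    static u32 my_hash(u32 x)
    {
        /* get_random_bytes() runs at most once for this call site */
        DO_ONCE(get_random_bytes, &hash_seed, sizeof(hash_seed));
        return x ^ hash_seed;
    }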
Link: https://lkml.kernel.org/r/20210722184349.76290-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/once.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index ae6f4eb41cbe..d361fb14ac3a 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -16,7 +16,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, * out the condition into a nop. DO_ONCE() guarantees type safety of * arguments! * - * Not that the following is not equivalent ... + * Note that the following is not equivalent ... * * DO_ONCE(func, arg); * DO_ONCE(func, arg); -- cgit v1.2.3 From c9221919a2d2df5741ab074dfec5bdfc6f1e043b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:44 -0700 Subject: units: change from 'L' to 'UL' Patch series "Add Hz macros", v3. There are multiple definitions of the HZ_PER_MHZ or HZ_PER_KHZ in the different drivers. Instead of duplicating this definition again and again, add one in the units.h header to be reused in all the places the redefinition occurs. At the same time, change the type of the Watts, as they cannot be negative. This patch (of 10): The users of the macros can safely be assigned an unsigned instead of a signed value, as the variables using them are themselves unsigned. Link: https://lkml.kernel.org/r/20210816114732.1834145-1-daniel.lezcano@linaro.org Link: https://lkml.kernel.org/r/20210816114732.1834145-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Cc: Andy Shevchenko Cc: Jonathan Cameron Cc: Christian Eggers Cc: Lukasz Luba Cc: MyungJoo Ham Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Peter Meerwald Cc: Zhang Rui Cc: Guenter Roeck Cc: Miquel Raynal Cc: Maxime Coquelin Cc: "Rafael J. Wysocki" Cc: Daniel Lezcano Cc: Chanwoo Choi Cc: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index dcc30a53fa93..ff51d3cfc6a0 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,9 +4,9 @@ #include -#define MILLIWATT_PER_WATT 1000L -#define MICROWATT_PER_MILLIWATT 1000L -#define MICROWATT_PER_WATT 1000000L +#define MILLIWATT_PER_WATT 1000UL +#define MICROWATT_PER_MILLIWATT 1000UL +#define MICROWATT_PER_WATT 1000000UL #define ABSOLUTE_ZERO_MILLICELSIUS -273150 -- cgit v1.2.3 From e2c77032fcbe515194107994d12cd72ddb77b022 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:48 -0700 Subject: units: add the HZ macros The macros for the unit conversion for frequency are duplicated in different places. Provide these macros in the 'units' header, so they can be reused. Link: https://lkml.kernel.org/r/20210816114732.1834145-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J.
Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index ff51d3cfc6a0..8b8dc8a84d93 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,6 +4,10 @@ #include +#define HZ_PER_KHZ 1000UL +#define KHZ_PER_MHZ 1000UL +#define HZ_PER_MHZ 1000000UL + #define MILLIWATT_PER_WATT 1000UL #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL -- cgit v1.2.3 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. [rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/user.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 2462f7d07695..00ed419dd464 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ struct user_struct { refcount_t __count; /* reference count */ #ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ #endif unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ -- cgit v1.2.3 From 5b91a75b3312c03798f555e10569fd85211a490c Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Tue, 7 Sep 2021 20:00:38 -0700 Subject: pid: cleanup the stale comment mentioning pidmap_init(). pidmap_init() has already been replaced with pid_idr_init() in the commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR API"). Cleanup the stale comment which still mentions it. Link: https://lkml.kernel.org/r/20210714120713.19825-1-itazur@amazon.com Signed-off-by: Takahiro Itazuri Cc: Kuniyuki Iwashima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/threads.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/threads.h b/include/linux/threads.h index 18d5a74bcc3d..c34173e6c5f1 100644 --- a/include/linux/threads.h +++ b/include/linux/threads.h @@ -38,7 +38,7 @@ * Define a minimum number of pids per cpu. Heuristically based * on original pid max of 32k for 32 cpus. 
Also, increase the * minimum settable value for pid_max on the running system based - * on similar defaults. See kernel/pid.c:pidmap_init() for details. + * on similar defaults. See kernel/pid.c:pid_idr_init() for details. */ #define PIDS_PER_CPU_DEFAULT 1024 #define PIDS_PER_CPU_MIN 8 -- cgit v1.2.3 From e130242dc351f1cfa2bbeb6766a1486ce936ef88 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:21 -0700 Subject: mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e0598c7d1d1..3a2ac5afee30 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -395,14 +395,6 @@ struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; -#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) - -#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) - -long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, - unsigned long bitmap_size); -long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, - unsigned long bitmap_size); void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from); int copy_siginfo_from_user32(kernel_siginfo_t *to, @@ -976,6 +968,15 @@ static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ +#define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) + +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) + +long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, + unsigned long bitmap_size); +long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, + unsigned long bitmap_size); + /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures -- cgit v1.2.3 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 3a2ac5afee30..2d42cebd1fb8 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -799,26 +799,6 @@ asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr /* mm/fadvise.c: No generic prototype for fadvise64_64 */ /* mm/, CONFIG_MMU only */ -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags); -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, - compat_ulong_t flags); -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode); -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes); -asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, - __u32 __user *pages, - const int __user *nodes, - int __user *status, - int flags); - asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); -- cgit v1.2.3 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 2 -- include/linux/uaccess.h | 10 ---------- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d42cebd1fb8..1c758b0e0359 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -511,8 +511,6 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, struct epoll_event; /* fortunately, this one is fixed-layout */ -extern void __user *compat_alloc_user_space(unsigned long len); - int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index c05e903cef02..ac0394087f7d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -200,16 +200,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n) n = _copy_to_user(to, from, n); return n; } -#ifdef CONFIG_COMPAT -static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void __user *from, unsigned long n) -{ - might_fault(); - if (access_ok(to, n) && access_ok(from, n)) - n = raw_copy_in_user(to, from, n); - return n; -} -#endif #ifndef copy_mc_to_kernel /* -- cgit v1.2.3 From b83a908498d68fafca931e1276e145b339cac5fb Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 2 Aug 2021 13:23:20 -0700 Subject: compiler_attributes.h: move __compiletime_{error|warning} Clang 14 will add support for __attribute__((__error__(""))) and __attribute__((__warning__(""))). To make use of these in __compiletime_error and __compiletime_warning (as used by BUILD_BUG and friends) for newer clang and detect/fallback for older versions of clang, move these to compiler_attributes.h and guard them with __has_attribute preprocessor guards. 
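These macros are what let BUILD_BUG()-style interfaces turn provable mistakes into build failures. A minimal usage sketch, with hypothetical helper names::

    #include <linux/string.h>
    #include <linux/types.h>

    extern void __my_copy_overflow(void)
        __compiletime_error("copy_checked: destination too small");

    static inline void copy_checked(void *dst, size_t dst_size,
                                    const void *src, size_t n)
    {
        /*
         * The call below survives dead-code elimination, and thus
         * triggers the error, only when the compiler can prove an
         * overflow at build time.
         */
        if (__builtin_constant_p(n) && n > dst_size)
            __my_copy_overflow();
        memcpy(dst, src, n);
    }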
Link: https://reviews.llvm.org/D106030
Link: https://bugs.llvm.org/show_bug.cgi?id=16428
Link: https://github.com/ClangBuiltLinux/linux/issues/1173
Signed-off-by: Nick Desaulniers
Reviewed-by: Nathan Chancellor
Reviewed-by: Kees Cook
[Reworded, landed in Clang 14]
Signed-off-by: Miguel Ojeda
---
 include/linux/compiler-gcc.h        |  3 ---
 include/linux/compiler_attributes.h | 24 ++++++++++++++++++++++++
 include/linux/compiler_types.h      |  6 ------
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5d97ef738a57..61c1479688db 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -43,9 +43,6 @@
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
-#define __compiletime_warning(message) __attribute__((__warning__(message)))
-#define __compiletime_error(message) __attribute__((__error__(message)))
-
 #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
 #define __latent_entropy __attribute__((latent_entropy))
 #endif
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 7b1fa5c30169..f4df9e5a8c76 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -30,6 +30,7 @@
 # define __GCC4_has_attribute___assume_aligned__      (__GNUC_MINOR__ >= 9)
 # define __GCC4_has_attribute___copy__                0
 # define __GCC4_has_attribute___designated_init__     0
+# define __GCC4_has_attribute___error__               1
 # define __GCC4_has_attribute___externally_visible__  1
 # define __GCC4_has_attribute___no_caller_saved_registers__ 0
 # define __GCC4_has_attribute___noclone__             1
@@ -38,6 +39,7 @@
 # define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9)
 # define __GCC4_has_attribute___no_sanitize_coverage__  0
 # define __GCC4_has_attribute___fallthrough__           0
+# define __GCC4_has_attribute___warning__               1
 #endif
 
 /*
@@ -137,6 +139,17 @@
 # define __designated_init
 #endif
 
+/*
+ * Optional: only supported since clang >= 14.0
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-error-function-attribute
+ */
+#if __has_attribute(__error__)
+# define __compiletime_error(msg) __attribute__((__error__(msg)))
+#else
+# define __compiletime_error(msg)
+#endif
+
 /*
  * Optional: not supported by clang
  *
@@ -286,6 +299,17 @@
  */
 #define __must_check __attribute__((__warn_unused_result__))
 
+/*
+ * Optional: only supported since clang >= 14.0
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warning-function-attribute
+ */
+#if __has_attribute(__warning__)
+# define __compiletime_warning(msg) __attribute__((__warning__(msg)))
+#else
+# define __compiletime_warning(msg)
+#endif
+
 /*
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index d29bda7f6ebd..8246d0caffa6 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -294,12 +294,6 @@ struct ftrace_likely_data {
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
 #endif
-#ifndef __compiletime_warning
-# define __compiletime_warning(message)
-#endif
-#ifndef __compiletime_error
-# define __compiletime_error(message)
-#endif
 
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix) \
--
cgit v1.2.3

From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001
From: Liu Zixian
Date: Wed, 8 Sep 2021 18:10:05 -0700
Subject: mm/hugetlb: initialize hugetlb_usage in mm_init

After fork, the child process will get incorrect (2x) hugetlb_usage.
If a process uses 5 2MB hugetlb pages in an anonymous mapping,

	HugetlbPages:	   10240 kB

and then forks, the child will show

	HugetlbPages:	   20480 kB

The amount is doubled because hugetlb_usage is first copied from the
parent and then increased again while the page tables are copied from
parent to child, leaving the child with twice its actual usage.

Fix this by adding hugetlb_count_init in mm_init.

Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com
Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status")
Signed-off-by: Liu Zixian
Reviewed-by: Naoya Horiguchi
Reviewed-by: Mike Kravetz
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f7ca1a3870ea..1faebe1cd0ed 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -858,6 +858,11 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
 
+static inline void hugetlb_count_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->hugetlb_usage, 0);
+}
+
 static inline void hugetlb_count_add(long l, struct mm_struct *mm)
 {
 	atomic_long_add(l, &mm->hugetlb_usage);
@@ -1042,6 +1047,10 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 	return &mm->page_table_lock;
 }
 
+static inline void hugetlb_count_init(struct mm_struct *mm)
+{
+}
+
 static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
 {
 }
--
cgit v1.2.3

From 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c Mon Sep 17 00:00:00 2001
From: Liam Howlett
Date: Wed, 8 Sep 2021 18:10:14 -0700
Subject: mmap_lock: change trace and locking order

Print to the trace log before releasing the lock to avoid racing with
other trace log printers of the same lock type.

Link: https://lkml.kernel.org/r/20210903022041.1843024-1-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett
Suggested-by: Steven Rostedt (VMware)
Reviewed-by: Matthew Wilcox (Oracle)
Cc: Michel Lespinasse
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmap_lock.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0540f0156f58..b179f1e3541a 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -101,14 +101,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
-	up_write(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, true);
+	up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
-	downgrade_write(&mm->mmap_lock);
 	__mmap_lock_trace_acquire_returned(mm, false, true);
+	downgrade_write(&mm->mmap_lock);
 }
 
 static inline void mmap_read_lock(struct mm_struct *mm)
@@ -140,8 +140,8 @@ static inline bool mmap_read_trylock(struct mm_struct *mm)
 
 static inline void mmap_read_unlock(struct mm_struct *mm)
 {
-	up_read(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, false);
+	up_read(&mm->mmap_lock);
 }
 
 static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
@@ -155,8 +155,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
 
 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
 {
-	up_read_non_owner(&mm->mmap_lock);
 	__mmap_lock_trace_released(mm, false);
+	up_read_non_owner(&mm->mmap_lock);
 }
 
 static inline void mmap_assert_locked(struct mm_struct *mm)
--
cgit v1.2.3

From 5dfe50b05588010f347cb2f436434bf22b7a84ed Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 9 Sep 2021 22:36:38 +0900
Subject: bootconfig: Rename xbc_node_find_child() to xbc_node_find_subkey()

Rename xbc_node_find_child() to xbc_node_find_subkey() to clarify that
the function returns a key node, never a value node. Since there are
xbc_node_for_each_child() (which loops over all child nodes) and
xbc_node_for_each_subkey() (which loops over subkey nodes only), this
naming distinction is necessary to avoid confusing users.

Link: https://lkml.kernel.org/r/163119459826.161018.11200274779483115300.stgit@devnote2
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/bootconfig.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index abe089c27529..537e1b991f11 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -110,7 +110,7 @@ static inline __init bool xbc_node_is_leaf(struct xbc_node *node)
 }
 
 /* Tree-based key-value access APIs */
-struct xbc_node * __init xbc_node_find_child(struct xbc_node *parent,
+struct xbc_node * __init xbc_node_find_subkey(struct xbc_node *parent,
 					     const char *key);
 
 const char * __init xbc_node_find_value(struct xbc_node *parent,
@@ -148,7 +148,7 @@ xbc_find_value(const char *key, struct xbc_node **vnode)
  */
 static inline struct xbc_node * __init xbc_find_node(const char *key)
 {
-	return xbc_node_find_child(NULL, key);
+	return xbc_node_find_subkey(NULL, key);
 }
 
 /**
--
cgit v1.2.3

From 8c854303ce0e38e5bbedd725ff39da7e235865d8 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 3 Aug 2021 16:16:21 +0200
Subject: cpu/hotplug: Remove deprecated CPU-hotplug functions.

No users in tree use the deprecated CPU-hotplug functions anymore.
Remove them.
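As the removed wrappers below show, they were plain aliases for the
cpus_read_lock() API, so the conversion in callers is mechanical. A
hedged sketch of a hypothetical caller (count_online_cpus_stable() is an
invented name; the lock functions and num_online_cpus() are the real
interfaces):

	#include <linux/cpu.h>
	#include <linux/cpumask.h>

	static unsigned int count_online_cpus_stable(void)
	{
		unsigned int cpus;

		cpus_read_lock();		/* was: get_online_cpus() */
		cpus = num_online_cpus();	/* CPUs cannot come or go here */
		cpus_read_unlock();		/* was: put_online_cpus() */

		return cpus;
	}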
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210803141621.780504-39-bigeasy@linutronix.de
---
 include/linux/cpu.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 94a578a96202..9cf51e41e697 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -143,12 +143,6 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif /* !CONFIG_HOTPLUG_CPU */
 
-/* Wrappers which go away once all code is converted */
-static inline void cpu_hotplug_begin(void) { cpus_write_lock(); }
-static inline void cpu_hotplug_done(void) { cpus_write_unlock(); }
-static inline void get_online_cpus(void) { cpus_read_lock(); }
-static inline void put_online_cpus(void) { cpus_read_unlock(); }
-
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
--
cgit v1.2.3

From c9871c800f65fffed40f3df3e1eb38984f95cfcf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 9 Sep 2021 14:34:59 +0200
Subject: Documentation: core-api/cpuhotplug: Rewrite the API section

Dave stumbled over the incomplete and confusing documentation of the
CPU hotplug API. Rewrite it, add the missing function documentation and
correct the existing entries.

Reported-by: Dave Chinner
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20210909123212.489059409@linutronix.de
---
 include/linux/cpuhotplug.h | 132 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 110 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 39cf84a30b9f..832d8a74fa59 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -22,8 +22,42 @@
  * AP_ACTIVE			AP_ACTIVE
  */
 
+/*
+ * CPU hotplug states. The state machine invokes the installed state
+ * startup callbacks sequentially from CPUHP_OFFLINE + 1 to CPUHP_ONLINE
+ * during a CPU online operation. During a CPU offline operation the
+ * installed teardown callbacks are invoked in the reverse order from
+ * CPU_ONLINE - 1 down to CPUHP_OFFLINE.
+ *
+ * The state space has three sections: PREPARE, STARTING and ONLINE.
+ *
+ * PREPARE: The callbacks are invoked on a control CPU before the
+ * hotplugged CPU is started up or after the hotplugged CPU has died.
+ *
+ * STARTING: The callbacks are invoked on the hotplugged CPU from the low level
+ * hotplug startup/teardown code with interrupts disabled.
+ *
+ * ONLINE: The callbacks are invoked on the hotplugged CPU from the per CPU
+ * hotplug thread with interrupts and preemption enabled.
+ *
+ * Adding explicit states to this enum is only necessary when:
+ *
+ * 1) The state is within the STARTING section
+ *
+ * 2) The state has ordering constraints vs. other states in the
+ *    same section.
+ *
+ * If neither #1 nor #2 apply, please use the dynamic state space when
+ * setting up a state by using CPUHP_PREPARE_DYN or CPUHP_PREPARE_ONLINE
+ * for the @state argument of the setup function.
+ *
+ * See Documentation/core-api/cpu_hotplug.rst for further information and
+ * examples.
+ */
 enum cpuhp_state {
 	CPUHP_INVALID = -1,
+
+	/* PREPARE section invoked on a control CPU */
 	CPUHP_OFFLINE = 0,
 	CPUHP_CREATE_THREADS,
 	CPUHP_PERF_PREPARE,
@@ -95,6 +129,11 @@ enum cpuhp_state {
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END	= CPUHP_BP_PREPARE_DYN + 20,
 	CPUHP_BRINGUP_CPU,
+
+	/*
+	 * STARTING section invoked on the hotplugged CPU in low level
+	 * bringup and teardown code.
+	 */
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
 	CPUHP_AP_SCHED_STARTING,
@@ -155,6 +194,8 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
+
+	/* Online section invoked on the hotplugged CPU from the hotplug thread */
 	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
@@ -216,14 +257,15 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
 				   int (*teardown)(unsigned int cpu),
 				   bool multi_instance);
 /**
- * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
+ * cpuhp_setup_state - Setup hotplug state callbacks with calling the @startup
+ *		       callback
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback (will be used in debug output)
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
- * Installs the callback functions and invokes the startup callback on
- * the present cpus which have already reached the @state.
+ * Installs the callback functions and invokes the @startup callback on
+ * the online cpus which have already reached the @state.
 */
 static inline int cpuhp_setup_state(enum cpuhp_state state,
 				    const char *name,
@@ -233,6 +275,18 @@ static inline int cpuhp_setup_state(enum cpuhp_state state,
 	return __cpuhp_setup_state(state, name, true, startup, teardown, false);
 }
 
+/**
+ * cpuhp_setup_state_cpuslocked - Setup hotplug state callbacks with calling
+ *				  @startup callback from a cpus_read_lock()
+ *				  held region
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback (will be used in debug output)
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
+ *
+ * Same as cpuhp_setup_state() except that it must be invoked from within a
+ * cpus_read_lock() held region.
+ */
 static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
 					       const char *name,
 					       int (*startup)(unsigned int cpu),
@@ -244,14 +298,14 @@ static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
 
 /**
 * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
- *			       callbacks
+ *			       @startup callback
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback.
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
- * Same as @cpuhp_setup_state except that no calls are executed are invoked
- * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n.
+ * Same as cpuhp_setup_state() except that the @startup callback is not
+ * invoked during installation. NOP if SMP=n or HOTPLUG_CPU=n.
 */
 static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
 					    const char *name,
@@ -262,6 +316,19 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
 					    false);
 }
 
+/**
+ * cpuhp_setup_state_nocalls_cpuslocked - Setup hotplug state callbacks without
+ *					  invoking the @startup callback from
+ *					  a cpus_read_lock() held region
+ *			     callbacks
+ * @state:	The state for which the calls are installed
+ * @name:	Name of the callback.
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
+ *
+ * Same as cpuhp_setup_state_nocalls() except that it must be invoked from
+ * within a cpus_read_lock() held region.
+ */
 static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
 						       const char *name,
 						       int (*startup)(unsigned int cpu),
@@ -275,13 +342,13 @@ static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
 * cpuhp_setup_state_multi - Add callbacks for multi state
 * @state:	The state for which the calls are installed
 * @name:	Name of the callback.
- * @startup:	startup callback function
- * @teardown:	teardown callback function
+ * @startup:	startup callback function or NULL if not required
+ * @teardown:	teardown callback function or NULL if not required
 *
 * Sets the internal multi_instance flag and prepares a state to work as a multi
 * instance callback. No callbacks are invoked at this point. The callbacks are
 * invoked once an instance for this state are registered via
- * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls.
+ * cpuhp_state_add_instance() or cpuhp_state_add_instance_nocalls()
 */
 static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
 					  const char *name,
@@ -306,9 +373,10 @@ int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
 * @state:	The state for which the instance is installed
 * @node:	The node for this individual state.
 *
- * Installs the instance for the @state and invokes the startup callback on
- * the present cpus which have already reached the @state. The @state must have
- * been earlier marked as multi-instance by @cpuhp_setup_state_multi.
+ * Installs the instance for the @state and invokes the registered startup
+ * callback on the online cpus which have already reached the @state. The
+ * @state must have been earlier marked as multi-instance by
+ * cpuhp_setup_state_multi().
 */
 static inline int cpuhp_state_add_instance(enum cpuhp_state state,
 					   struct hlist_node *node)
@@ -322,8 +390,9 @@ static inline int cpuhp_state_add_instance(enum cpuhp_state state,
 * @state:	The state for which the instance is installed
 * @node:	The node for this individual state.
 *
- * Installs the instance for the @state The @state must have been earlier
- * marked as multi-instance by @cpuhp_setup_state_multi.
+ * Installs the instance for the @state. The @state must have been earlier
+ * marked as multi-instance by cpuhp_setup_state_multi. NOP if SMP=n or
+ * HOTPLUG_CPU=n.
 */
 static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 						   struct hlist_node *node)
@@ -331,6 +400,17 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 	return __cpuhp_state_add_instance(state, node, false);
 }
 
+/**
+ * cpuhp_state_add_instance_nocalls_cpuslocked - Add an instance for a state
+ *						 without invoking the startup
+ *						 callback from a cpus_read_lock()
+ *						 held region.
+ * @state:	The state for which the instance is installed
+ * @node:	The node for this individual state.
+ *
+ * Same as cpuhp_state_add_instance_nocalls() except that it must be
+ * invoked from within a cpus_read_lock() held region.
+ */
 static inline int
 cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
 					    struct hlist_node *node)
@@ -346,7 +426,7 @@ void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
 * @state:	The state for which the calls are removed
 *
 * Removes the callback functions and invokes the teardown callback on
- * the present cpus which have already reached the @state.
+ * the online cpus which have already reached the @state.
 */
 static inline void cpuhp_remove_state(enum cpuhp_state state)
 {
@@ -355,7 +435,7 @@ static inline void cpuhp_remove_state(enum cpuhp_state state)
 /**
 * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking
- *				teardown
+ *				the teardown callback
 * @state:	The state for which the calls are removed
 */
 static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
@@ -363,6 +443,14 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
 	__cpuhp_remove_state(state, false);
 }
 
+/**
+ * cpuhp_remove_state_nocalls_cpuslocked - Remove hotplug state callbacks without invoking
+ *					   teardown from a cpus_read_lock() held region.
+ * @state:	The state for which the calls are removed
+ *
+ * Same as cpuhp_remove_state nocalls() except that it must be invoked
+ * from within a cpus_read_lock() held region.
+ */
 static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
 {
 	__cpuhp_remove_state_cpuslocked(state, false);
@@ -390,8 +478,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
 * @state:	The state from which the instance is removed
 * @node:	The node for this individual state.
 *
- * Removes the instance and invokes the teardown callback on the present cpus
- * which have already reached the @state.
+ * Removes the instance and invokes the teardown callback on the online cpus
+ * which have already reached @state.
 */
 static inline int cpuhp_state_remove_instance(enum cpuhp_state state,
 					      struct hlist_node *node)
--
cgit v1.2.3
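To make the rewritten kernel-doc concrete, here is a hedged sketch of a
client of this API: a hypothetical module registering a dynamic
ONLINE-section state. The demo_* names are invented for this example;
cpuhp_setup_state(), CPUHP_AP_ONLINE_DYN and cpuhp_remove_state() are the
real interfaces documented above:

	#include <linux/cpu.h>
	#include <linux/cpuhotplug.h>
	#include <linux/module.h>
	#include <linux/printk.h>

	static enum cpuhp_state demo_state;

	/* ONLINE-section callbacks: run on the hotplugged CPU, preemptible. */
	static int demo_cpu_online(unsigned int cpu)
	{
		pr_info("demo: CPU%u is coming online\n", cpu);
		return 0;	/* a negative return aborts the online operation */
	}

	static int demo_cpu_offline(unsigned int cpu)
	{
		pr_info("demo: CPU%u is going down\n", cpu);
		return 0;
	}

	static int __init demo_init(void)
	{
		int ret;

		/*
		 * CPUHP_AP_ONLINE_DYN reserves a dynamic state; per the
		 * cpuhp_setup_state() kernel-doc the @startup callback is
		 * also invoked on all CPUs that are already online.
		 */
		ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
					demo_cpu_online, demo_cpu_offline);
		if (ret < 0)
			return ret;

		demo_state = ret;	/* remember the allocated state */
		return 0;
	}

	static void __exit demo_exit(void)
	{
		/* Runs the teardown callback on all online CPUs, then unregisters. */
		cpuhp_remove_state(demo_state);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");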