From 2d4f545cb14fa2d9865124183f3100d46dc9b3b5 Mon Sep 17 00:00:00 2001 From: Peter Meerwald Date: Mon, 4 Jun 2018 23:34:06 +0200 Subject: rfkill: Fixes and cleanup of kernel-doc in the header file Fixes kerneldoc parameter names to match implementation, rfkill_set_hw_state(), rfkill_set_sw_state(). Fix description of rfkill_resume_polling(). Fix typos in documentation of rfkill_find_type(). Consistently start kerneldoc description with uppercase letter. Signed-off-by: Peter Meerwald-Stadler Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index e6a0031d1b1f..8ad2487a86d5 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -66,7 +66,7 @@ struct rfkill_ops { #if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE) /** - * rfkill_alloc - allocate rfkill structure + * rfkill_alloc - Allocate rfkill structure * @name: name of the struct -- the string is not copied internally * @parent: device that has rf switch on it * @type: type of the switch (RFKILL_TYPE_*) @@ -112,7 +112,7 @@ void rfkill_pause_polling(struct rfkill *rfkill); /** * rfkill_resume_polling(struct rfkill *rfkill) * - * Pause polling -- say transmitter is off for other reasons. + * Resume polling * NOTE: not necessary for suspend/resume -- in that case the * core stops polling anyway */ @@ -130,7 +130,7 @@ void rfkill_resume_polling(struct rfkill *rfkill); void rfkill_unregister(struct rfkill *rfkill); /** - * rfkill_destroy - free rfkill structure + * rfkill_destroy - Free rfkill structure * @rfkill: rfkill structure to be destroyed * * Destroys the rfkill structure. @@ -140,7 +140,7 @@ void rfkill_destroy(struct rfkill *rfkill); /** * rfkill_set_hw_state - Set the internal rfkill hardware block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current hardware block state to set + * @blocked: the current hardware block state to set * * rfkill drivers that get events when the hard-blocked state changes * use this function to notify the rfkill core (and through that also @@ -161,7 +161,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); /** * rfkill_set_sw_state - Set the internal rfkill software block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current software block state to set + * @blocked: the current software block state to set * * rfkill drivers that get events when the soft-blocked state changes * (yes, some platforms directly act on input but allow changing again) @@ -183,7 +183,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked); /** * rfkill_init_sw_state - Initialize persistent software block state * @rfkill: pointer to the rfkill class to modify. - * @state: the current software block state to set + * @blocked: the current software block state to set * * rfkill drivers that preserve their software block state over power off * use this function to notify the rfkill core (and through that also @@ -208,17 +208,17 @@ void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked); void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); /** - * rfkill_blocked - query rfkill block + * rfkill_blocked - Query rfkill block state * * @rfkill: rfkill struct to query */ bool rfkill_blocked(struct rfkill *rfkill); /** - * rfkill_find_type - Helpper for finding rfkill type by name + * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that conrresponds the name. + * Returns enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); @@ -296,7 +296,7 @@ static inline enum rfkill_type rfkill_find_type(const char *name) const char *rfkill_get_led_trigger_name(struct rfkill *rfkill); /** - * rfkill_set_led_trigger_name -- set the LED trigger name + * rfkill_set_led_trigger_name - Set the LED trigger name * @rfkill: rfkill struct * @name: LED trigger name * -- cgit v1.2.3 From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:42 +0300 Subject: cfg80211: Add support for HE Add support for the HE in cfg80211 and also add userspace API to nl80211 to send rate information out, conforming with P802.11ax_D2.0. Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 427 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 427 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8fe7e4306816..e6a6503bfa33 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1539,6 +1539,106 @@ struct ieee80211_vht_operation { __le16 basic_mcs_set; } __packed; +/** + * struct ieee80211_he_cap_elem - HE capabilities element + * + * This structure is the "HE capabilities element" fixed fields as + * described in P802.11ax_D2.0 section 9.4.2.237.2 and 9.4.2.237.3 + */ +struct ieee80211_he_cap_elem { + u8 mac_cap_info[5]; + u8 phy_cap_info[9]; +} __packed; + +#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 + +/** + * enum ieee80211_he_mcs_support - HE MCS support definitions + * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported + * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the rx_mcs_* + * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. + */ +enum ieee80211_he_mcs_support { + IEEE80211_HE_MCS_SUPPORT_0_7 = 0, + IEEE80211_HE_MCS_SUPPORT_0_9 = 1, + IEEE80211_HE_MCS_SUPPORT_0_11 = 2, + IEEE80211_HE_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field + * + * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field + * described in P802.11ax_D2.0 section 9.4.2.237.4 + * + * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + */ +struct ieee80211_he_mcs_nss_supp { + __le16 rx_mcs_80; + __le16 tx_mcs_80; + __le16 rx_mcs_160; + __le16 tx_mcs_160; + __le16 rx_mcs_80p80; + __le16 tx_mcs_80p80; +} __packed; + +/** + * struct ieee80211_he_operation - HE capabilities element + * + * This structure is the "HE operation element" fields as + * described in P802.11ax_D2.0 section 9.4.2.238 + */ +struct ieee80211_he_operation { + __le32 he_oper_params; + __le16 he_mcs_nss_set; + /* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */ + u8 optional[0]; +} __packed; + +/** + * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field + * + * This structure is the "MU AC Parameter Record" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_he_mu_edca_param_ac_rec { + u8 aifsn; + u8 ecw_min_max; + u8 mu_edca_timer; +} __packed; + +/** + * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element + * + * This structure is the "MU EDCA Parameter Set element" fields as + * described in P802.11ax_D2.0 section 9.4.2.240 + */ +struct ieee80211_mu_edca_param_set { + u8 mu_qos_info; + struct ieee80211_he_mu_edca_param_ac_rec ac_be; + struct ieee80211_he_mu_edca_param_ac_rec ac_bk; + struct ieee80211_he_mu_edca_param_ac_rec ac_vi; + struct ieee80211_he_mu_edca_param_ac_rec ac_vo; +} __packed; /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 @@ -1577,6 +1677,328 @@ struct ieee80211_vht_operation { #define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 #define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 +/* 802.11ax HE MAC capabilities */ +#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 +#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 +#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 + +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_1 0x00 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_2 0x10 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_3 0x20 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_4 0x30 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_5 0x40 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_6 0x50 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_7 0x60 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_8 0x70 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_QOS_MASK 0x70 + +/* Link adaptation is split between byte HE_MAC_CAP1 and + * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE + * in which case the following values apply: + * 0 = No feedback. + * 1 = reserved. + * 2 = Unsolicited feedback. + * 3 = both + */ +#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 + +#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 +#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 +#define IEEE80211_HE_MAC_CAP2_UL_MU_RESP_SCHED 0x04 +#define IEEE80211_HE_MAC_CAP2_BSR 0x08 +#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 +#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 +#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 +#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 + +#define IEEE80211_HE_MAC_CAP3_GRP_ADDR_MULTI_STA_BA_DL_MU 0x01 +#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 +#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 + +/* The maximum length of an A-MDPU is defined by the combination of the Maximum + * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the + * same field in the HE capabilities. + */ +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_USE_VHT 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_VHT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_RESERVED 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_A_AMPDU_LEN_EXP_MASK 0x18 +#define IEEE80211_HE_MAC_CAP3_A_AMSDU_FRAG 0x20 +#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 +#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 + +#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 +#define IEEE80211_HE_MAC_CAP4_QTP 0x02 +#define IEEE80211_HE_MAC_CAP4_BQR 0x04 +#define IEEE80211_HE_MAC_CAP4_SR_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 +#define IEEE80211_HE_MAC_CAP4_OPS 0x20 +#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU 0x40 + +/* 802.11ax HE PHY capabilities */ +#define IEEE80211_HE_PHY_CAP0_DUAL_BAND 0x01 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe + +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f +#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 +#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 +#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 +/* Midamble RX Max NSTS is split between byte #2 and byte #3 */ +#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_MAX_NSTS 0x80 + +#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_MAX_NSTS 0x01 +#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 +#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 +#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 + +/* Note that the meaning of UL MU below is different between an AP and a non-AP + * sta, where in the AP case it indicates support for Rx and in the non-AP sta + * case it indicates support for Tx. + */ +#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 +#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 + +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 +#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA 0x40 +#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 + +#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 +#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 + +/* Minimal allowed value of Max STS under 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c + +/* Minimal allowed value of Max STS above 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 + +#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 +#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 + +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 +#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 + +#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR 0x02 +#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 +#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 +#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 + +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 +#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 +#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_2X_AND_1XLTF 0x20 + +/* 802.11ax HE TX/RX MCS NSS Support */ +#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 + +/* TX/RX HE MCS Support field Highest MCS subfield encoding */ +enum ieee80211_he_highest_mcs_supported_subfield_enc { + HIGHEST_MCS_SUPPORTED_MCS7 = 0, + HIGHEST_MCS_SUPPORTED_MCS8, + HIGHEST_MCS_SUPPORTED_MCS9, + HIGHEST_MCS_SUPPORTED_MCS10, + HIGHEST_MCS_SUPPORTED_MCS11, +}; + +/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ +static inline u8 +ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) +{ + u8 count = 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + count += 4; + + return count; +} + +/* 802.11ax HE PPE Thresholds */ +#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) +#define IEEE80211_PPE_THRES_NSS_POS (0) +#define IEEE80211_PPE_THRES_NSS_MASK (7) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ + (BIT(5) | BIT(6)) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) +#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) + +/* + * Calculate 802.11ax HE capabilities IE PPE field size + * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* + */ +static inline u8 +ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u8 n; + + if ((phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + return 0; + + n = hweight8(ppe_thres_hdr & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + return n; +} + +/* HE Operation defines */ +#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x0000003f +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x000001c0 +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET 6 +#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000200 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x000ffc00 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 10 +#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x000100000 +#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x000200000 +#define IEEE80211_HE_OPERATION_MULTI_BSSID_AP 0x10000000 +#define IEEE80211_HE_OPERATION_TX_BSSID_INDICATOR 0x20000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x40000000 + +/* + * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size + * @he_oper_ie: byte data of the He Operations IE, stating from the the byte + * after the ext ID byte. It is assumed that he_oper_ie has at least + * sizeof(struct ieee80211_he_operation) bytes, checked already in + * ieee802_11_parse_elems_crc() + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_oper_size(const u8 *he_oper_ie) +{ + struct ieee80211_he_operation *he_oper = (void *)he_oper_ie; + u8 oper_len = sizeof(struct ieee80211_he_operation); + u32 he_oper_params; + + /* Make sure the input is not NULL */ + if (!he_oper_ie) + return 0; + + /* Calc required length */ + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + oper_len += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_MULTI_BSSID_AP) + oper_len++; + + /* Add the first byte (extension ID) to the total length */ + oper_len++; + + return oper_len; +} + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -1992,6 +2414,11 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_FILS_WRAPPED_DATA = 8, WLAN_EID_EXT_FILS_PUBLIC_KEY = 12, WLAN_EID_EXT_FILS_NONCE = 13, + WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14, + WLAN_EID_EXT_HE_CAPABILITY = 35, + WLAN_EID_EXT_HE_OPERATION = 36, + WLAN_EID_EXT_UORA = 37, + WLAN_EID_EXT_HE_MU_EDCA = 38, }; /* Action category code */ -- cgit v1.2.3 From b8042b3da925f390c1482bf9dc0898dc0b3ea7b5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Jun 2018 22:39:29 +0200 Subject: ieee80211: bump IEEE80211_MAX_AMPDU_BUF to support HE Bump the IEEE80211_MAX_AMPDU_BUF size to 0x100 for HE support and - for now - use IEEE80211_MAX_AMPDU_BUF_HT everywhere. This is derived from my internal patch, parts of which Luca had sent upstream. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e6a6503bfa33..9c03a7d5e400 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1433,11 +1433,13 @@ struct ieee80211_ht_operation { #define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 /* - * A-PMDU buffer sizes - * According to IEEE802.11n spec size varies from 8K to 64K (in powers of 2) + * A-MPDU buffer sizes + * According to HT size varies from 8 to 64 frames + * HE adds the ability to have up to 256 frames. */ -#define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF 0x40 +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 +#define IEEE80211_MAX_AMPDU_BUF 0x100 /* Spatial Multiplexing Power Save Modes (for capability) */ -- cgit v1.2.3 From 38b7ca927d6a12bf4466be22a90b7d998f5ae69a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Mar 2018 14:36:27 +0200 Subject: net/mlx5: Expose DEVX specification This patch updates the mlx5_ifc structures and command interface to support DEVX. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 3 ++ include/linux/mlx5/mlx5_ifc.h | 67 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 02f72ebf31a7..f8671c0a43aa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1071,6 +1071,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_GEN(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) +#define MLX5_CAP_GEN_64(mdev, cap) \ + MLX5_GET64(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap) + #define MLX5_CAP_GEN_MAX(mdev, cap) \ MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 27134c4fcb76..f810772e80c0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -75,6 +75,15 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, }; +enum { + MLX5_GENERAL_OBJ_TYPES_CAP_UCTX = (1ULL << 4), + MLX5_GENERAL_OBJ_TYPES_CAP_UMEM = (1ULL << 5), +}; + +enum { + MLX5_OBJ_TYPE_UCTX = 0x0004, +}; + enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, @@ -242,6 +251,8 @@ enum { MLX5_CMD_OP_FPGA_QUERY_QP = 0x962, MLX5_CMD_OP_FPGA_DESTROY_QP = 0x963, MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS = 0x964, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT = 0xa03, MLX5_CMD_OP_MAX }; @@ -1113,7 +1124,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_3f8[0x3]; u8 log_max_current_uc_list[0x5]; - u8 reserved_at_400[0x80]; + u8 general_obj_types[0x40]; + + u8 reserved_at_440[0x40]; u8 reserved_at_480[0x3]; u8 log_max_l2_table[0x5]; @@ -9115,4 +9128,56 @@ struct mlx5_ifc_dealloc_memic_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_general_obj_in_cmd_hdr_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 obj_type[0x10]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_general_obj_out_cmd_hdr_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_umem_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x5b]; + u8 log_page_size[0x5]; + + u8 page_offset[0x20]; + + u8 num_of_mtt[0x40]; + + struct mlx5_ifc_mtt_bits mtt[0]; +}; + +struct mlx5_ifc_uctx_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x1c0]; +}; + +struct mlx5_ifc_create_umem_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_umem_bits umem; +}; + +struct mlx5_ifc_create_uctx_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_uctx_bits uctx; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 0af5107cd0640ee3424e337b492e4b11b450ce28 Mon Sep 17 00:00:00 2001 From: Talat Batheesh Date: Thu, 17 May 2018 11:14:18 +0300 Subject: net/mlx5: Add RoCE RX ICRC encapsulated counter Add capability bit in PCAM register and RoCE ICRC error counter to PPCNT register. Signed-off-by: Talat Batheesh Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f810772e80c0..deb3a459a22e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1681,7 +1681,11 @@ struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { u8 rx_buffer_full_low[0x20]; - u8 reserved_at_1c0[0x600]; + u8 rx_icrc_encapsulated_high[0x20]; + + u8 rx_icrc_encapsulated_low[0x20]; + + u8 reserved_at_200[0x5c0]; }; struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits { @@ -8044,8 +8048,9 @@ struct mlx5_ifc_peir_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x76]; - + u8 reserved_at_0[0x6d]; + u8 rx_icrc_encapsulated_counter[0x1]; + u8 reserved_at_6e[0x8]; u8 pfcc_mask[0x1]; u8 reserved_at_77[0x4]; u8 rx_buffer_fullness_counters[0x1]; -- cgit v1.2.3 From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: split rhashtable.h Due to the use of rhashtables in net namespaces, rhashtable.h is included in lots of the kernel, so a small changes can required a large recompilation. This makes development painful. This patch splits out rhashtable-types.h which just includes the major type declarations, and does not include (non-trivial) inline code. rhashtable.h is no longer included by anything in the include/ directory. Common include files only include rhashtable-types.h so a large recompilation is only triggered when that changes. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/mroute_base.h | 2 +- include/linux/rhashtable-types.h | 139 +++++++++++++++++++++++++++++++++++++++ include/linux/rhashtable.h | 127 +---------------------------------- 5 files changed, 144 insertions(+), 128 deletions(-) create mode 100644 include/linux/rhashtable-types.h (limited to 'include/linux') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 6cc2df7f7ac9..e1c9eea6015b 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b5630c8eb2f3..6cea726612b7 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct user_namespace; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index d633f737b3c6..fd436cdd4725 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,7 +2,7 @@ #define __LINUX_MROUTE_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 000000000000..9740063ff13b --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. + */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + u8 locks_mul; + u32 nulls_base; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 4e1f535c2034..48754ab07cdf 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * @@ -17,16 +18,14 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include #include #include #include #include #include -#include #include +#include /* * The end of the chain is marked with a special nulls marks which has * the following format: @@ -64,15 +63,6 @@ */ #define RHT_ELASTICITY 16u -struct rhash_head { - struct rhash_head __rcu *next; -}; - -struct rhlist_head { - struct rhash_head rhead; - struct rhlist_head __rcu *next; -}; - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -102,114 +92,6 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -/** - * struct rhashtable_compare_arg - Key for the function rhashtable_compare - * @ht: Hash table - * @key: Key to compare against - */ -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable; - -/** - * struct rhashtable_params - Hash table construction parameters - * @nelem_hint: Hint on number of elements, should be 75% of desired size - * @key_len: Length of key - * @key_offset: Offset of key in struct to be hashed - * @head_offset: Offset of rhash_head in struct to be hashed - * @max_size: Maximum size while expanding - * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) - * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker - * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) - * @obj_hashfn: Function to hash object - * @obj_cmpfn: Function to compare key with object - */ -struct rhashtable_params { - u16 nelem_hint; - u16 key_len; - u16 key_offset; - u16 head_offset; - unsigned int max_size; - u16 min_size; - bool automatic_shrinking; - u8 locks_mul; - u32 nulls_base; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -/** - * struct rhashtable - Hash table handle - * @tbl: Bucket table - * @key_len: Key length for hashfn - * @max_elems: Maximum number of elements in table - * @p: Configuration parameters - * @rhlist: True if this is an rhltable - * @run_work: Deferred worker to expand/shrink asynchronously - * @mutex: Mutex to protect current/future table swapping - * @lock: Spin lock to protect walker list - * @nelems: Number of elements in table - */ -struct rhashtable { - struct bucket_table __rcu *tbl; - unsigned int key_len; - unsigned int max_elems; - struct rhashtable_params p; - bool rhlist; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -}; - -/** - * struct rhltable - Hash table with duplicate objects in a list - * @ht: Underlying rhtable - */ -struct rhltable { - struct rhashtable ht; -}; - -/** - * struct rhashtable_walker - Hash table walker - * @list: List entry on list of walkers - * @tbl: The table that we were walking over - */ -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -/** - * struct rhashtable_iter - Hash table iterator - * @ht: Table to iterate through - * @p: Current pointer - * @list: Current hash list pointer - * @walker: Associated rhashtable walker - * @slot: Current slot - * @skip: Number of entries to skip in slot - */ -struct rhashtable_iter { - struct rhashtable *ht; - struct rhash_head *p; - struct rhlist_head *list; - struct rhashtable_walker walker; - unsigned int slot; - unsigned int skip; - bool end_of_table; -}; - static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, } #endif /* CONFIG_PROVE_LOCKING */ -int rhashtable_init(struct rhashtable *ht, - const struct rhashtable_params *params); -int rhltable_init(struct rhltable *hlt, - const struct rhashtable_params *params); - void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); -- cgit v1.2.3 From 9f9a707738aa7a8b9f78a641b83927ada256a626 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: remove nulls_base and related code. This "feature" is unused, undocumented, and untested and so doesn't really belong. A patch is under development to properly implement support for detecting when a search gets diverted down a different chain, which the common purpose of nulls markers. This patch actually fixes a bug too. The table resizing allows a table to grow to 2^31 buckets, but the hash is truncated to 27 bits - any growth beyond 2^27 is wasteful an ineffective. This patch results in NULLS_MARKER(0) being used for all chains, and leaves the use of rht_is_a_null() to test for it. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable-types.h | 2 -- include/linux/rhashtable.h | 33 +++------------------------------ 2 files changed, 3 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 9740063ff13b..763d613ce2c2 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -50,7 +50,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @min_size: Minimum size while shrinking * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object * @obj_cmpfn: Function to compare key with object @@ -64,7 +63,6 @@ struct rhashtable_params { u16 min_size; bool automatic_shrinking; u8 locks_mul; - u32 nulls_base; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 48754ab07cdf..d9f719af7936 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -28,25 +28,8 @@ #include /* * The end of the chain is marked with a special nulls marks which has - * the following format: - * - * +-------+-----------------------------------------------------+-+ - * | Base | Hash |1| - * +-------+-----------------------------------------------------+-+ - * - * Base (4 bits) : Reserved to distinguish between multiple tables. - * Specified via &struct rhashtable_params.nulls_base. - * Hash (27 bits): Full hash (unmasked) of first element added to bucket - * 1 (1 bit) : Nulls marker (always set) - * - * The remaining bits of the next pointer remain unused for now. + * the least significant bit set. */ -#define RHT_BASE_BITS 4 -#define RHT_HASH_BITS 27 -#define RHT_BASE_SHIFT RHT_HASH_BITS - -/* Base bits plus 1 bit for nulls marker */ -#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) /* Maximum chain length before rehash * @@ -92,24 +75,14 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) -{ - return NULLS_MARKER(ht->p.nulls_base + hash); -} - #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ - ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + ((ptr) = (typeof(ptr)) NULLS_MARKER(0)) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } -static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) -{ - return ((unsigned long) ptr) >> 1; -} - static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { @@ -119,7 +92,7 @@ static inline void *rht_obj(const struct rhashtable *ht, static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { - return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1); + return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, -- cgit v1.2.3 From 9b4f64a227b6f462482a8cc68c7134dc6e26f1c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: simplify INIT_RHT_NULLS_HEAD() The 'ht' and 'hash' arguments to INIT_RHT_NULLS_HEAD() are no longer used - so drop them. This allows us to also remove the nhash argument from nested_table_alloc(). Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index d9f719af7936..3f3a182bd0b4 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -75,7 +75,7 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ +#define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = (typeof(ptr)) NULLS_MARKER(0)) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) -- cgit v1.2.3 From c0690016a73fe6bd456887bbbe6e10c7f0096554 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: clean up dereference of ->future_tbl. Using rht_dereference_bucket() to dereference ->future_tbl looks like a type error, and could be confusing. Using rht_dereference_rcu() to test a pointer for NULL adds an unnecessary barrier - rcu_access_pointer() is preferred for NULL tests when no lock is held. This uses 3 different ways to access ->future_tbl. - if we know the mutex is held, use rht_dereference() - if we don't hold the mutex, and are only testing for NULL, use rcu_access_pointer() - otherwise (using RCU protection for true dereference), use rht_dereference_rcu(). Note that this includes a simplification of the call to rhashtable_last_table() - we don't do an extra dereference before the call any more. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3f3a182bd0b4..eb7111039247 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -595,7 +595,7 @@ static inline void *__rhashtable_insert_fast( lock = rht_bucket_lock(tbl, hash); spin_lock_bh(lock); - if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) { + if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: spin_unlock_bh(lock); rcu_read_unlock(); -- cgit v1.2.3 From 3f6c65d6255a872846c44182c82c78d3dc6239f5 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 19 Jun 2018 21:42:50 -0700 Subject: tcp: ignore rcv_rtt sample with old ts ecr value When receiving multiple packets with the same ts ecr value, only try to compute rcv_rtt sample with the earliest received packet. This is because the rcv_rtt calculated by later received packets could possibly include long idle time or other types of delay. For example: (1) server sends last packet of reply with TS val V1 (2) client ACKs last packet of reply with TS ecr V1 (3) long idle time passes (4) client sends next request data packet with TS ecr V1 (again!) At this time, the rcv_rtt computed on server with TS ecr V1 will be inflated with the idle time and should get ignored. Signed-off-by: Wei Wang Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 72705eaf4b84..3dbea6610304 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -350,6 +350,7 @@ struct tcp_sock { #endif /* Receiver side RTT estimation */ + u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; -- cgit v1.2.3 From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:13:49 +0900 Subject: net: Convert GRO SKB handling to list_head. Manage pending per-NAPI GRO packets via list_head. Return an SKB pointer from the GRO receive handlers. When GRO receive handlers return non-NULL, it means that this SKB needs to be completed at this time and removed from the NAPI queue. Several operations are greatly simplified by this transformation, especially timing out the oldest SKB in the list when gro_count exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue in reverse order. Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 3 +-- include/linux/netdevice.h | 32 ++++++++++++++++---------------- include/linux/skbuff.h | 3 ++- include/linux/udp.h | 4 ++-- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 79563840c295..572e11bb8696 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb); +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..f176d9873910 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -322,7 +322,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct sk_buff *gro_list; + struct list_head gro_list; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; @@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb) return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } -typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); -static inline struct sk_buff **call_gro_receive(gro_receive_t cb, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); +static inline struct sk_buff *call_gro_receive(gro_receive_t cb, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb, return cb(head, skb); } -typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, - struct sk_buff *); -static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, + struct sk_buff *); +static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2299,8 +2299,8 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *(*gro_receive)(struct list_head *head, + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, } #ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } #else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..7ccc601b55d9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -677,7 +677,8 @@ struct sk_buff { int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem & tcp stack */ + struct list_head list; }; struct sock *sk; diff --git a/include/linux/udp.h b/include/linux/udp.h index ca840345571b..320d49d85484 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -74,8 +74,8 @@ struct udp_sock { void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ - struct sk_buff ** (*gro_receive)(struct sock *sk, - struct sk_buff **head, + struct sk_buff * (*gro_receive)(struct sock *sk, + struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From 07d78363dcffd9cb1bf6f06a6cac0e0847f3c1de Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:14:02 +0900 Subject: net: Convert NAPI gro list into a small hash table. Improve the performance of GRO receive by splitting flows into multiple hash chains. Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f176d9873910..c6b377a15869 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str); /* * Structure for NAPI scheduling similar to tasklet but with weighting */ +#define GRO_HASH_BUCKETS 8 struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +323,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_list; + struct list_head gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; -- cgit v1.2.3 From a8f62d0c6fe533e07cd1acce7588278f9d6e7720 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 25 Jun 2018 20:37:08 +0800 Subject: ptp: support DPAA FMan 1588 timer in ptp_qoriq This patch is to support DPAA (Data Path Acceleration Architecture) 1588 timer by adding "fsl,fman-ptp-timer" compatible, sharing interrupt with FMan, adding FSL_DPAA_ETH dependency, and fixing up register offset. Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Acked-by: Madalin Bucur Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index b462d9ea8007..dc3dac40f069 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -11,9 +11,8 @@ /* * qoriq ptp registers - * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010 */ -struct qoriq_ptp_registers { +struct ctrl_regs { u32 tmr_ctrl; /* Timer control register */ u32 tmr_tevent; /* Timestamp event register */ u32 tmr_temask; /* Timer event mask register */ @@ -28,22 +27,47 @@ struct qoriq_ptp_registers { u8 res1[4]; u32 tmroff_h; /* Timer offset high */ u32 tmroff_l; /* Timer offset low */ - u8 res2[8]; +}; + +struct alarm_regs { u32 tmr_alarm1_h; /* Timer alarm 1 high register */ u32 tmr_alarm1_l; /* Timer alarm 1 high register */ u32 tmr_alarm2_h; /* Timer alarm 2 high register */ u32 tmr_alarm2_l; /* Timer alarm 2 high register */ - u8 res3[48]; +}; + +struct fiper_regs { u32 tmr_fiper1; /* Timer fixed period interval */ u32 tmr_fiper2; /* Timer fixed period interval */ u32 tmr_fiper3; /* Timer fixed period interval */ - u8 res4[20]; +}; + +struct etts_regs { u32 tmr_etts1_h; /* Timestamp of general purpose external trigger */ u32 tmr_etts1_l; /* Timestamp of general purpose external trigger */ u32 tmr_etts2_h; /* Timestamp of general purpose external trigger */ u32 tmr_etts2_l; /* Timestamp of general purpose external trigger */ }; +struct qoriq_ptp_registers { + struct ctrl_regs __iomem *ctrl_regs; + struct alarm_regs __iomem *alarm_regs; + struct fiper_regs __iomem *fiper_regs; + struct etts_regs __iomem *etts_regs; +}; + +/* Offset definitions for the four register groups */ +#define CTRL_REGS_OFFSET 0x0 +#define ALARM_REGS_OFFSET 0x40 +#define FIPER_REGS_OFFSET 0x80 +#define ETTS_REGS_OFFSET 0xa0 + +#define FMAN_CTRL_REGS_OFFSET 0x80 +#define FMAN_ALARM_REGS_OFFSET 0xb8 +#define FMAN_FIPER_REGS_OFFSET 0xd0 +#define FMAN_ETTS_REGS_OFFSET 0xe0 + + /* Bit definitions for the TMR_CTRL register */ #define ALM1P (1<<31) /* Alarm1 output polarity */ #define ALM2P (1<<30) /* Alarm2 output polarity */ @@ -105,10 +129,10 @@ struct qoriq_ptp_registers { #define DRIVER "ptp_qoriq" #define DEFAULT_CKSEL 1 #define N_EXT_TS 2 -#define REG_SIZE sizeof(struct qoriq_ptp_registers) struct qoriq_ptp { - struct qoriq_ptp_registers __iomem *regs; + void __iomem *base; + struct qoriq_ptp_registers regs; spinlock_t lock; /* protects regs */ struct ptp_clock *clock; struct ptp_clock_info caps; -- cgit v1.2.3 From e7d4a95da86e0b048702765bbdcdc968aaf312e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Jun 2018 08:58:28 +0200 Subject: bitfield: fix *_encode_bits() There's a bug in *_encode_bits() in using ~field_multiplier() for the check whether or not the constant value fits into the field, this is wrong and clearly ~field_mask() was intended. This was triggering for me for both constant and non-constant values. Additionally, make this case actually into an compile error. Declaring the extern function that will never exist with just a warning is pointless as then later we'll just get a link error. While at it, also fix the indentation in those lines I'm touching. Finally, as suggested by Andy Shevchenko, add some tests and for that introduce also u8 helpers. The tests don't compile without the fix, showing that it's necessary. Fixes: 00b0c9b82663 ("Add primitives for manipulating bitfields both in host- and fixed-endian.") Reviewed-by: Andy Shevchenko Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index cf2588d81148..147a7bb341dd 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -104,7 +104,7 @@ (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ }) -extern void __compiletime_warning("value doesn't fit into mask") +extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") __bad_mask(void); @@ -121,8 +121,8 @@ static __always_inline u64 field_mask(u64 field) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type type##_encode_bits(base v, base field) \ { \ - if (__builtin_constant_p(v) && (v & ~field_multiplier(field))) \ - __field_overflow(); \ + if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ + __field_overflow(); \ return to((v & field_mask(field)) * field_multiplier(field)); \ } \ static __always_inline __##type type##_replace_bits(__##type old, \ -- cgit v1.2.3 From 37a3862e1238262e9866d5d66e5f5f9069cee3a1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Jun 2018 08:58:29 +0200 Subject: bitfield: add u8 helpers There's no reason why we shouldn't pack/unpack bits into/from u8 values/registers/etc., so add u8 helpers. Use the ____MAKE_OP() macro directly to avoid having nonsense le8_encode_bits() and similar functions. Reviewed-by: Andy Shevchenko Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 147a7bb341dd..65a6981eef7b 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -143,6 +143,7 @@ static __always_inline base type##_get_bits(__##type v, base field) \ ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ ____MAKE_OP(u##size,u##size,,) +____MAKE_OP(u8,u8,,) __MAKE_OP(16) __MAKE_OP(32) __MAKE_OP(64) -- cgit v1.2.3 From 80d19669ecd34423e85ca04f2210b0e42a47cb16 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:41 -0700 Subject: net: Refactor XPS for CPUs and Rx queues Refactor XPS code to support Tx queue selection based on CPU(s) map or Rx queue(s) map. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- include/linux/cpumask.h | 11 ++++-- include/linux/netdevice.h | 98 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index bf53d893ad02..57f20a0a7794 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask; #define cpu_active(cpu) ((cpu) == 0) #endif -/* verify cpu argument to cpumask_* operators */ -static inline unsigned int cpumask_check(unsigned int cpu) +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS - WARN_ON_ONCE(cpu >= nr_cpumask_bits); + WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +} + +/* verify cpu argument to cpumask_* operators */ +static inline unsigned int cpumask_check(unsigned int cpu) +{ + cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6b377a15869..8bf8d6149f79 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -731,10 +731,15 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map __rcu *cpu_map[0]; + struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */ }; -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ + +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) + +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ + (_rxqs * (_tcs) * sizeof(struct xps_map *))) + #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 @@ -1910,7 +1915,8 @@ struct net_device { int watchdog_timeo; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps; + struct xps_dev_maps __rcu *xps_cpus_map; + struct xps_dev_maps __rcu *xps_rxqs_map; #endif #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map); + +/** + * netif_attr_test_mask - Test a CPU or Rx queue set in a mask + * @j: CPU/Rx queue index + * @mask: bitmask of all cpus/rx queues + * @nr_bits: number of bits in the bitmask + * + * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. + */ +static inline bool netif_attr_test_mask(unsigned long j, + const unsigned long *mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + return test_bit(j, mask); +} + +/** + * netif_attr_test_online - Test for online CPU/Rx queue + * @j: CPU/Rx queue index + * @online_mask: bitmask for CPUs/Rx queues that are online + * @nr_bits: number of bits in the bitmask + * + * Returns true if a CPU/Rx queue is online. + */ +static inline bool netif_attr_test_online(unsigned long j, + const unsigned long *online_mask, + unsigned int nr_bits) +{ + cpu_max_bits_warn(j, nr_bits); + + if (online_mask) + return test_bit(j, online_mask); + + return (j < nr_bits); +} + +/** + * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask + * @n: CPU/Rx queue index + * @srcp: the cpumask/Rx queue mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set. + */ +static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (srcp) + return find_next_bit(srcp, nr_bits, n + 1); + + return n + 1; +} + +/** + * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p + * @n: CPU/Rx queue index + * @src1p: the first CPUs/Rx queues mask pointer + * @src2p: the second CPUs/Rx queues mask pointer + * @nr_bits: number of bits in the bitmask + * + * Returns >= nr_bits if no further CPUs/Rx queues set in both. + */ +static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, + const unsigned long *src2p, + unsigned int nr_bits) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); + + if (src1p && src2p) + return find_next_and_bit(src1p, src2p, nr_bits, n + 1); + else if (src1p) + return find_next_bit(src1p, nr_bits, n + 1); + else if (src2p) + return find_next_bit(src2p, nr_bits, n + 1); + + return n + 1; +} #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, -- cgit v1.2.3 From 8a9c58d28d0f66569737a3295116710ed24573cd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:12 +0800 Subject: sctp: add support for dscp and flowlabel per transport Like some other per transport params, flowlabel and dscp are added in transport, asoc and sctp_sock. By default, transport sets its value from asoc's, and asoc does it from sctp_sock. flowlabel only works for ipv6 transport. Other than that they need to be passed down in sctp_xmit, flow4/6 also needs to set them before looking up route in get_dst. Note that it uses '& 0x100000' to check if flowlabel is set and '& 0x1' (tos 1st bit is unused) to check if dscp is set by users, so that they could be set to 0 by sockopt in next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b36c76635f18..83d94341e003 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -801,4 +801,11 @@ struct sctp_strreset_resptsn { __be32 receivers_next_tsn; }; +enum { + SCTP_DSCP_SET_MASK = 0x1, + SCTP_DSCP_VAL_MASK = 0xfc, + SCTP_FLOWLABEL_SET_MASK = 0x100000, + SCTP_FLOWLABEL_VAL_MASK = 0xfffff +}; + #endif /* __LINUX_SCTP_H__ */ -- cgit v1.2.3 From f6ad8c1bcdf014272d08c55b9469536952a0a771 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:12:45 +0100 Subject: net: core: trivial netif_receive_skb_list() entry point Just calls netif_receive_skb() in a loop. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64480a0f2c16..f67258f057ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3477,6 +3477,7 @@ int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -- cgit v1.2.3 From 4ce0017a373afaaa9ef17614d8fa4f6fde261d18 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:13:40 +0100 Subject: net: core: another layer of lists, around PF_MEMALLOC skb handling First example of a layer splitting the list (rather than merely taking individual packets off it). Involves new list.h function, list_cut_before(), like list_cut_position() but cuts on the other side of the given entry. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/list.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 4b129df4d46b..de04cc5ed536 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head *list, __list_cut_position(list, head, entry); } +/** + * list_cut_before - cut a list into two, before given entry + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * + * This helper moves the initial part of @head, up to but + * excluding @entry, from @head to @list. You should pass + * in @entry an element you know is on @head. @list should + * be an empty list or a list you do not care about losing + * its data. + * If @entry == @head, all entries on @head are moved to + * @list. + */ +static inline void list_cut_before(struct list_head *list, + struct list_head *head, + struct list_head *entry) +{ + if (head->next == entry) { + INIT_LIST_HEAD(list); + return; + } + list->next = head->next; + list->next->prev = list; + list->prev = entry->prev; + list->prev->next = list; + head->next = entry; + entry->prev = head; +} + static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) -- cgit v1.2.3 From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:12 +0100 Subject: net: ipv4: listified version of ip_rcv Also involved adding a way to run a netfilter hook over a list of packets. Rather than attempting to make netfilter know about lists (which would be a major project in itself) we just let it call the regular okfn (in this case ip_rcv_finish()) for any packets it steals, and have it give us back a list of packets it's synchronously accepted (which normally NF_HOOK would automatically call okfn() on, but we want to be able to potentially pass the list to a listified version of okfn().) The netfilter hooks themselves are indirect calls that still happen per- packet (see nf_hook_entry_hookfn()), but again, changing that can be left for future work. There is potential for out-of-order receives if the netfilter hook ends up synchronously stealing packets, as they will be processed before any accepts earlier in the list. However, it was already possible for an asynchronous accept to cause out-of-order receives, so presumably this is considered OK. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f67258f057ca..c1ef749b6f9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2297,6 +2297,9 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + void (*list_func) (struct list_head *, + struct packet_type *, + struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); void *af_packet_priv; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index dd2052f0efb7..5a5e0a2ab2a3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct return ret; } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); + if (ret != 1) + list_del(&skb->list); + } +} + /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); @@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, return okfn(net, sk, skb); } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + /* nothing to do */ +} + static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, -- cgit v1.2.3 From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:53 -0700 Subject: net/sched: Introduce the ETF Qdisc The ETF (Earliest TxTime First) qdisc uses the information added earlier in this series (the socket option SO_TXTIME and the new role of sk_buff->tstamp) to schedule packets transmission based on absolute time. For some workloads, just bandwidth enforcement is not enough, and precise control of the transmission of packets is necessary. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \ clockid CLOCK_TAI In this example, the Qdisc will provide SW best-effort for the control of the transmission time to the network adapter, the time stamp in the socket will be in reference to the clockid CLOCK_TAI and packets will leave the qdisc "delta" (100000) nanoseconds before its transmission time. The ETF qdisc will buffer packets sorted by their txtime. It will drop packets on enqueue() if their skbuff clockid does not match the clock reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped if it expires while being enqueued. The qdisc also supports the SO_TXTIME deadline mode. For this mode, it will dequeue a packet as soon as possible and change the skb timestamp to 'now' during etf_dequeue(). Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid to be configured, but it's been decided that usage of CLOCK_TAI should be enforced until we decide to allow for other clockids to be used. The rationale here is that PTP times are usually in the TAI scale, thus no other clocks should be necessary. For now, the qdisc will return EINVAL if any clocks other than CLOCK_TAI are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1ef749b6f9f..f06ee8f91e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -798,6 +798,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed -- cgit v1.2.3 From 4d4fb5dc988a36307711be292bde6e39b8bdbceb Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Tue, 19 Jun 2018 08:47:22 +0300 Subject: net/mlx5: Limit scope of dump_fill_mkey function mlx5_core_dump_fill_mkey() is going to be used in next patch in IB and doesn't need to be visible to whole mlx5_core. Move that command to mlx5_ib. Signed-off-by: Yonatan Cohen Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 80cbb7fdce4a..1cb1c0317b77 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1067,8 +1067,6 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *out, int outlen); -int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, - u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, -- cgit v1.2.3 From b183ee27f5fb07c8428e2fe45d5f35dac611c45d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 19 Jun 2018 08:47:23 +0300 Subject: net/mlx5: Add hardware definitions for dump_fill_mkey MLX5 IB HCA offers the memory key, dump_fill_mkey to boost performance by forcing local HCA operations to skip the PCI bus access, This patch adds needed hardware definitions. Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index deb3a459a22e..1853e7fd6924 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -885,7 +885,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_eq_sz[0x8]; u8 reserved_at_e8[0x2]; u8 log_max_mkey[0x6]; - u8 reserved_at_f0[0xc]; + u8 reserved_at_f0[0x8]; + u8 dump_fill_mkey[0x1]; + u8 reserved_at_f9[0x3]; u8 log_max_eq[0x4]; u8 max_indirection[0x8]; -- cgit v1.2.3 From 6312fe77751f57d4fa2b28abeef84c6a95c28136 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 5 Jul 2018 14:34:32 +0800 Subject: net: limit each hash list length to MAX_GRO_SKBS After commit 07d78363dcff ("net: Convert NAPI gro list into a small hash table.")' there is 8 hash buckets, which allows more flows to be held for merging. but MAX_GRO_SKBS, the total held skb for merging, is 8 skb still, limit the hash table performance. keep MAX_GRO_SKBS as 8 skb, but limit each hash list length to 8 skb, not the total 8 skb Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f06ee8f91e74..b683971e500d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -302,6 +302,11 @@ struct netdev_boot_setup { int __init netdev_boot_setup(char *str); +struct gro_list { + struct list_head list; + int count; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -323,7 +328,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct list_head gro_hash[GRO_HASH_BUCKETS]; + struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; -- cgit v1.2.3 From 1b0707a781eeaeeaacb464e18bb3f049b99b496b Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 7 Jun 2018 19:50:38 +0000 Subject: Bluetooth: remove unused bt-nokia-h4p.h header Nothing in tree use this header which seems a remains of a staging driver. This patch remove it. Signed-off-by: Corentin Labbe Reviewed-by: Sebastian Reichel Signed-off-by: Marcel Holtmann --- include/linux/platform_data/bt-nokia-h4p.h | 38 ------------------------------ 1 file changed, 38 deletions(-) delete mode 100644 include/linux/platform_data/bt-nokia-h4p.h (limited to 'include/linux') diff --git a/include/linux/platform_data/bt-nokia-h4p.h b/include/linux/platform_data/bt-nokia-h4p.h deleted file mode 100644 index 30d169dfadf3..000000000000 --- a/include/linux/platform_data/bt-nokia-h4p.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is part of Nokia H4P bluetooth driver - * - * Copyright (C) 2010 Nokia Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA - * 02110-1301 USA - * - */ - - -/** - * struct hci_h4p_platform data - hci_h4p Platform data structure - */ -struct hci_h4p_platform_data { - int chip_type; - int bt_sysclk; - unsigned int bt_wakeup_gpio; - unsigned int host_wakeup_gpio; - unsigned int reset_gpio; - int reset_gpio_shared; - unsigned int uart_irq; - phys_addr_t uart_base; - const char *uart_iclk; - const char *uart_fclk; - void (*set_pm_limits)(struct device *dev, bool set); -}; -- cgit v1.2.3 From 06ae48269d1e0324d806fca30fe77112f4a4a14a Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 6 Jul 2018 15:13:18 -0700 Subject: lib: reciprocal_div: implement the improved algorithm on the paper mentioned The new added "reciprocal_value_adv" implements the advanced version of the algorithm described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose ceil(log2(d)) result will be 32 which then requires u128 divide on host. The exception case could be easily handled before calling "reciprocal_value_adv". The advanced version requires more complex calculation to get the reciprocal multiplier and other control variables, but then could reduce the required emulation operations. It makes no sense to use this advanced version for host divide emulation, those extra complexities for calculating multiplier etc could completely waive our saving on emulation operations. However, it makes sense to use it for JIT divide code generation (for example eBPF JIT backends) for which we are willing to trade performance of JITed code with that of host. As shown by the following pseudo code, the required emulation operations could go down from 6 (the basic version) to 3 or 4. To use the result of "reciprocal_value_adv", suppose we want to calculate n/d, the C-style pseudo code will be the following, it could be easily changed to real code generation for other JIT targets. struct reciprocal_value_adv rvalue; u8 pre_shift, exp; // handle exception case. if (d >= (1U << 31)) { result = n >= d; return; } rvalue = reciprocal_value_adv(d, 32) exp = rvalue.exp; if (rvalue.is_wide_m && !(d & 1)) { // floor(log2(d & (2^32 -d))) pre_shift = fls(d & -d) - 1; rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); } else { pre_shift = 0; } // code generation starts. if (imm == 1U << exp) { result = n >> exp; } else if (rvalue.is_wide_m) { // pre_shift must be zero when reached here. t = (n * rvalue.m) >> 32; result = n - t; result >>= 1; result += t; result >>= rvalue.sh - 1; } else { if (pre_shift) result = n >> pre_shift; result = ((u64)result * rvalue.m) >> 32; result >>= rvalue.sh; } Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/reciprocal_div.h | 68 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h index e031e9f2f9d8..585ce89c0f33 100644 --- a/include/linux/reciprocal_div.h +++ b/include/linux/reciprocal_div.h @@ -25,6 +25,9 @@ struct reciprocal_value { u8 sh1, sh2; }; +/* "reciprocal_value" and "reciprocal_divide" together implement the basic + * version of the algorithm described in Figure 4.1 of the paper. + */ struct reciprocal_value reciprocal_value(u32 d); static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) @@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) return (t + ((a - t) >> R.sh1)) >> R.sh2; } +struct reciprocal_value_adv { + u32 m; + u8 sh, exp; + bool is_wide_m; +}; + +/* "reciprocal_value_adv" implements the advanced version of the algorithm + * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose + * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The + * exception case could be easily handled before calling "reciprocal_value_adv". + * + * The advanced version requires more complex calculation to get the reciprocal + * multiplier and other control variables, but then could reduce the required + * emulation operations. + * + * It makes no sense to use this advanced version for host divide emulation, + * those extra complexities for calculating multiplier etc could completely + * waive our saving on emulation operations. + * + * However, it makes sense to use it for JIT divide code generation for which + * we are willing to trade performance of JITed code with that of host. As shown + * by the following pseudo code, the required emulation operations could go down + * from 6 (the basic version) to 3 or 4. + * + * To use the result of "reciprocal_value_adv", suppose we want to calculate + * n/d, the pseudo C code will be: + * + * struct reciprocal_value_adv rvalue; + * u8 pre_shift, exp; + * + * // handle exception case. + * if (d >= (1U << 31)) { + * result = n >= d; + * return; + * } + * + * rvalue = reciprocal_value_adv(d, 32) + * exp = rvalue.exp; + * if (rvalue.is_wide_m && !(d & 1)) { + * // floor(log2(d & (2^32 -d))) + * pre_shift = fls(d & -d) - 1; + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); + * } else { + * pre_shift = 0; + * } + * + * // code generation starts. + * if (imm == 1U << exp) { + * result = n >> exp; + * } else if (rvalue.is_wide_m) { + * // pre_shift must be zero when reached here. + * t = (n * rvalue.m) >> 32; + * result = n - t; + * result >>= 1; + * result += t; + * result >>= rvalue.sh - 1; + * } else { + * if (pre_shift) + * result = n >> pre_shift; + * result = ((u64)result * rvalue.m) >> 32; + * result >>= rvalue.sh; + * } + */ +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); + #endif /* _LINUX_RECIPROCAL_DIV_H */ -- cgit v1.2.3 From b233504033dbd65740e59681820ccfd0a2a8ec53 Mon Sep 17 00:00:00 2001 From: Yifeng Sun Date: Mon, 2 Jul 2018 08:18:03 -0700 Subject: openvswitch: kernel datapath clone action Add 'clone' action to kernel datapath by using existing functions. When actions within clone don't modify the current flow, the flow key is not cloned before executing clone actions. This is a follow up patch for this incomplete work: https://patchwork.ozlabs.org/patch/722096/ v1 -> v2: Refactor as advised by reviewer. Signed-off-by: Yifeng Sun Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/linux/openvswitch.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index e6b240b6196c..379affc63e24 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -21,4 +21,9 @@ #include +#define OVS_CLONE_ATTR_EXEC 0 /* Specify an u32 value. When nonzero, + * actions in clone will not change flow + * keys. False otherwise. + */ + #endif /* _LINUX_OPENVSWITCH_H */ -- cgit v1.2.3 From ffcfe25bb50f27395e15fa999f1a7eb769f55360 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:38 -0400 Subject: net: Add support for subordinate device traffic classes This patch is meant to provide the basic tools needed to allow us to create subordinate device traffic classes. The general idea here is to allow subdividing the queues of a device into queue groups accessible through an upper device such as a macvlan. The idea here is to enforce the idea that an upper device has to be a single queue device, ideally with IFF_NO_QUQUE set. With that being the case we can pretty much guarantee that the tc_to_txq mappings and XPS maps for the upper device are unused. As such we could reuse those in order to support subdividing the lower device and distributing those queues between the subordinate devices. In order to distinguish between a regular set of traffic classes and if a device is carrying subordinate traffic classes I changed num_tc from a u8 to a s16 value and use the negative values to represent the subordinate pool values. So starting at -1 and running to -32768 we can encode those as pool values, and the existing values of 0 to 15 can be maintained. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..b1ff77276bc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -575,6 +575,9 @@ struct netdev_queue { * (/sys/class/net/DEV/Q/trans_timeout) */ unsigned long trans_timeout; + + /* Subordinate device that the queue has been assigned to */ + struct net_device *sb_dev; /* * write-mostly part */ @@ -1991,7 +1994,7 @@ struct net_device { #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - u8 num_tc; + s16 num_tc; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; @@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev); +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset); +int netdev_set_sb_channel(struct net_device *dev, u16 channel); +static inline int netdev_get_sb_channel(struct net_device *dev) +{ + return max_t(int, -dev->num_tc, 0); +} + static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) -- cgit v1.2.3 From eadec877ce9ca46a94e9036b5a44e7941d4fc501 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:48 -0400 Subject: net: Add support for subordinate traffic classes to netdev_pick_tx This change makes it so that we can support the concept of subordinate device traffic classes to the core networking code. In doing this we can start pulling out the driver specific bits needed to support selecting a queue based on an upper device. The solution at is currently stands is only partially implemented. I have the start of some XPS bits in here, but I would still need to allow for configuration of the XPS maps on the queues reserved for the subordinate devices. For now I am using the reference to the sb_dev XPS map as just a way to skip the lookup of the lower device XPS map for now as that would result in the wrong queue being picked. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1ff77276bc4..fda0bcda7a42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2103,7 +2103,7 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev @@ -2568,7 +2568,7 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); -- cgit v1.2.3 From a4ea8a3dacc312c3402c78f6e4843afdda9b43a0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:54 -0400 Subject: net: Add generic ndo_select_queue functions This patch adds a generic version of the ndo_select_queue functions for either returning 0 or selecting a queue based on the processor ID. This is generally meant to just reduce the number of functions we have to change in the future when we have to deal with ndo_select_queue changes. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fda0bcda7a42..46f4c44ce3e4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); -- cgit v1.2.3 From 4f49dec9075aa0277b8c9c657ec31e6361f88724 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:19:59 -0400 Subject: net: allow ndo_select_queue to pass netdev This patch makes it so that instead of passing a void pointer as the accel_priv we instead pass a net_device pointer as sb_dev. Making this change allows us to pass the subordinate device through to the fallback function eventually so that we can keep the actual code in the ndo_select_queue call as focused on possible on the exception cases. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46f4c44ce3e4..bbf062c1ca8a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -957,7 +957,8 @@ struct dev_ifalias { * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv, select_queue_fallback_t fallback); + * struct net_device *sb_dev, + * select_queue_fallback_t fallback); * Called to decide which queue to use when device supports multiple * transmit queues. * @@ -1229,7 +1230,7 @@ struct net_device_ops { netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); @@ -2568,9 +2569,11 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, select_queue_fallback_t fallback); + struct net_device *sb_dev, + select_queue_fallback_t fallback); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); -- cgit v1.2.3 From 8ec56fc3c5ee6f9700adac190e9ce5b8859a58b6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 9 Jul 2018 12:20:04 -0400 Subject: net: allow fallback function to pass netdev For most of these calls we can just pass NULL through to the fallback function as the sb_dev. The only cases where we cannot are the cases where we might be dealing with either an upper device or a driver that would have configured things to support an sb_dev itself. The only driver that has any significant change in this patch set should be ixgbe as we can drop the redundant functionality that existed in both the ndo_select_queue function and the fallback function that was passed through to us. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bbf062c1ca8a..2daf2fa6554f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -793,7 +793,8 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, - struct sk_buff *skb); + struct sk_buff *skb, + struct net_device *sb_dev); enum tc_setup_type { TC_SETUP_QDISC_MQPRIO, -- cgit v1.2.3 From 9f17dbf04ddf55ae48f5bbafea4c4920ea943215 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 9 Jul 2018 18:10:02 +0100 Subject: netfilter: fix use-after-free in NF_HOOK_LIST nf_hook() can free the skb, so we need to remove it from the list before calling, and add passed skbs to a sublist afterwards. Fixes: 17266ee93984 ("net: ipv4: listified version of ip_rcv") Reported-by: Dan Carpenter Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netfilter.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 5a5e0a2ab2a3..23b48de8c2e2 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -294,12 +294,16 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *skb, *next; + struct list_head sublist; + INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); - if (ret != 1) - list_del(&skb->list); + list_del(&skb->list); + if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1) + list_add_tail(&skb->list, &sublist); } + /* Put passed packets back on main list */ + list_splice(&sublist, head); } /* Call setsockopt() */ -- cgit v1.2.3 From b60a60405fb95a688eb2ef4ef20f5fcaa7b64f68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: netfilter: Add nf_ct_get_tuple_skb global lookup function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a global netfilter function to extract a conntrack tuple from an skb. The function uses a new function added to nf_ct_hook, which will try to get the tuple from skb->_nfct, and do a full lookup if that fails. This makes it possible to use the lookup function before the skb has passed through the conntrack init hooks (e.g., in an ingress qdisc). The tuple is copied to the caller to avoid issues with reference counting. The function returns false if conntrack is not loaded, allowing it to be used without incurring a module dependency on conntrack. This is used by the NAT mode in sch_cake. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/netfilter.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 23b48de8c2e2..07efffd0c759 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -414,8 +414,17 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); +struct nf_conntrack_tuple; +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} +struct nf_conntrack_tuple; +static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + return false; +} #endif struct nf_conn; @@ -424,6 +433,8 @@ enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); + bool (*get_tuple_skb)(struct nf_conntrack_tuple *, + const struct sk_buff *); }; extern struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From 3443b00e07eed5798605ba6524de37e1d3f1a4bf Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 10 Jul 2018 10:02:57 +0300 Subject: team: Publish team_port_get_rcu() A follow-up patch adds a new entry point, team_port_dev_txable(). Making it an ordinary exported function would mean that any module that may need the service in one of the supported configurations also unconditionally needs to pull in the team module, whether or not the user actually intends to create team interfaces. To prevent that, team_port_dev_txable() is defined in if_team.h, and therefore all dependencies of that function also need to be publicly-visible. Therefore move team_port_get_rcu() from team.c to if_team.h. Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_team.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index d95cae09dea0..0d07c6655cce 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -74,6 +74,11 @@ struct team_port { long mode_priv[0]; }; +static inline struct team_port *team_port_get_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->rx_handler_data); +} + static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; -- cgit v1.2.3 From eeed992b776c54af6108187c87ac60d028e69d37 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 10 Jul 2018 10:02:58 +0300 Subject: net: Add lag.h, net_lag_port_dev_txable() LAG devices (team or bond) recognize for each one of their slave devices whether LAG traffic is going to be sent through that device. Bond calls such devices "active", team calls them "txable". When this state changes, a NETDEV_CHANGELOWERSTATE notification is distributed, together with a netdev_notifier_changelowerstate_info structure that for LAG devices includes a tx_enabled flag that refers to the new state. The notification thus makes it possible to react to the changes in txability in drivers. However there's no way to query txability from the outside on demand. That is problematic namely for mlxsw, which when resolving ERSPAN packet path, may encounter a LAG device, and needs to determine which of the slaves it should choose. To that end, introduce a new function, net_lag_port_dev_txable(), which determines whether a given slave device is "active" or "txable" (depending on the flavor of the LAG device). That function then dispatches to per-LAG-flavor helpers, bond_is_active_slave_dev() resp. team_port_dev_txable(). Because there currently is no good place where net_lag_port_dev_txable() should be added, introduce a new header file, lag.h, which should from now on hold any logic common to both team and bond. (But keep netif_is_lag_master() together with the rest of netif_is_*_master() functions). Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_team.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 0d07c6655cce..ac42da56f7a2 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -89,6 +89,19 @@ static inline bool team_port_txable(struct team_port *port) return port->linkup && team_port_enabled(port); } +static inline bool team_port_dev_txable(const struct net_device *port_dev) +{ + struct team_port *port; + bool txable; + + rcu_read_lock(); + port = team_port_get_rcu(port_dev); + txable = port ? team_port_txable(port) : false; + rcu_read_unlock(); + + return txable; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) -- cgit v1.2.3 From cca9bab1b72cd2296097c75f59ef11ef80461279 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:16:12 +0200 Subject: tcp: use monotonic timestamps for PAWS Using get_seconds() for timestamps is deprecated since it can lead to overflows on 32-bit systems. While the interface generally doesn't overflow until year 2106, the specific implementation of the TCP PAWS algorithm breaks in 2038 when the intermediate signed 32-bit timestamps overflow. A related problem is that the local timestamps in CLOCK_REALTIME form lead to unexpected behavior when settimeofday is called to set the system clock backwards or forwards by more than 24 days. While the first problem could be solved by using an overflow-safe method of comparing the timestamps, a nicer solution is to use a monotonic clocksource with ktime_get_seconds() that simply doesn't overflow (at least not until 136 years after boot) and that doesn't change during settimeofday(). To make 32-bit and 64-bit architectures behave the same way here, and also save a few bytes in the tcp_options_received structure, I'm changing the type to a 32-bit integer, which is now safe on all architectures. Finally, the ts_recent_stamp field also (confusingly) gets used to store a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow(). This is currently safe, but changing the type to 32-bit requires some small changes there to keep it working. Signed-off-by: Arnd Bergmann Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dbea6610304..58a8d7d71354 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -89,7 +89,7 @@ struct tcp_sack_block { struct tcp_options_received { /* PAWS/RTTM data */ - long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ + int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ @@ -426,7 +426,7 @@ struct tcp_timewait_sock { /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; - long tw_ts_recent_stamp; + int tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif -- cgit v1.2.3 From 523f9eb1ef25aab4aaf9aeb5356160e8039411ef Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:15 +0300 Subject: net/mlx4_core: Add health buffer address capability Health buffer address is a 32 bit PCI address offset provided by the FW. This offset is used for reading FW health debug data located on the shared CR space. Cr space is accessible in both driver and FW and allows for different queries and configurations. Health buffer size is always 64B of readable data followed by a lock which is used to block volatile CR space access. Signed-off-by: Alex Vesker Signed-off-by: Tariq Toukan Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 122e7e9d3091..e3bfe76aea98 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -630,6 +630,7 @@ struct mlx4_caps { u32 vf_caps; bool wol_port[MLX4_MAX_PORTS + 1]; struct mlx4_rate_limit_caps rl_caps; + u32 health_buffer_addrs; }; struct mlx4_buf_list { -- cgit v1.2.3 From bedc989b0c98285b277ff8a08ff9514e580913f4 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:16 +0300 Subject: net/mlx4_core: Add Crdump FW snapshot support Crdump allows the driver to create a snapshot of the FW PCI crspace and health buffer during a critical FW issue. In case of a FW command timeout, FW getting stuck or a non zero value on the catastrophic buffer, a snapshot will be taken. The snapshot is exposed using devlink, cr-space, fw-health address regions are registered on init and snapshots are attached once a new snapshot is collected by the driver. Signed-off-by: Alex Vesker Signed-off-by: Tariq Toukan Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e3bfe76aea98..300b944e6e1e 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -852,6 +852,11 @@ struct mlx4_vf_dev { u8 n_ports; }; +struct mlx4_fw_crdump { + struct devlink_region *region_crspace; + struct devlink_region *region_fw_health; +}; + enum mlx4_pci_status { MLX4_PCI_STATUS_DISABLED, MLX4_PCI_STATUS_ENABLED, @@ -872,6 +877,7 @@ struct mlx4_dev_persistent { u8 interface_state; struct mutex pci_status_mutex; /* sync pci state */ enum mlx4_pci_status pci_status; + struct mlx4_fw_crdump crdump; }; struct mlx4_dev { -- cgit v1.2.3 From 3c641ba4a852cf4e90e3d7f29c5df08e24213c5d Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:18 +0300 Subject: net/mlx4_core: Use devlink region_snapshot parameter This parameter enables capturing region snapshot of the crspace during critical errors. The default value of this parameter is disabled, it can be enabled using devlink param commands. It is possible to configure during runtime and also driver init. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 300b944e6e1e..dca6ab4eaa99 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -853,6 +853,7 @@ struct mlx4_vf_dev { }; struct mlx4_fw_crdump { + bool snapshot_enable; struct devlink_region *region_crspace; struct devlink_region *region_fw_health; }; -- cgit v1.2.3 From 6b8675897338f874c41612655a85d8e10cdb23d8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:39 -0700 Subject: xdp: don't make drivers report attachment mode prog_attached of struct netdev_bpf should have been superseded by simply setting prog_id long time ago, but we kept it around to allow offloading drivers to communicate attachment mode (drv vs hw). Subsequently drivers were also allowed to report back attachment flags (prog_flags), and since nowadays only programs attached will XDP_FLAGS_HW_MODE can get offloaded, we can tell the attachment mode from the flags driver reports. Remove prog_attached member. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b683971e500d..69a664789b33 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -819,10 +819,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, - /* Check if a bpf program is set on the device. The callee should - * set @prog_attached to one of XDP_ATTACHED_* values, note that "true" - * is equivalent to XDP_ATTACHED_DRV. - */ XDP_QUERY_PROG, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_VERIFIER_PREP, @@ -849,7 +845,6 @@ struct netdev_bpf { }; /* XDP_QUERY_PROG */ struct { - u8 prog_attached; u32 prog_id; /* flags with which program was installed */ u32 prog_flags; -- cgit v1.2.3 From a25717d2b604347d9af8da81deea7b08e8c94220 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:41 -0700 Subject: xdp: support simultaneous driver and hw XDP attachment Split the query of HW-attached program from the software one. Introduce new .ndo_bpf command to query HW-attached program. This will allow drivers to install different programs in HW and SW at the same time. Netlink can now also carry multiple programs on dump (in which case mode will be set to XDP_ATTACHED_MULTI and user has to check per-attachment point attributes, IFLA_XDP_PROG_ID will not be present). We reuse IFLA_XDP_PROG_ID skb space for second mode, so rtnl_xdp_size() doesn't need to be updated. Note that the installation side is still not there, since all drivers currently reject installing more than one program at the time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69a664789b33..2422c0e88f5c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -820,6 +820,7 @@ enum bpf_netdev_command { XDP_SETUP_PROG, XDP_SETUP_PROG_HW, XDP_QUERY_PROG, + XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, @@ -843,7 +844,7 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG */ + /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ struct { u32 prog_id; /* flags with which program was installed */ @@ -3533,8 +3534,8 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); -void __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - struct netdev_bpf *xdp); +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, + enum bpf_netdev_command cmd); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); -- cgit v1.2.3 From c921c2077b32081617789a645120148bc8b60c98 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Jul 2018 12:16:43 +0300 Subject: net: ipmr: add support for passing full packet on wrong vif This patch adds support for IGMPMSG_WRVIFWHOLE which is used to pass full packet and real vif id when the incoming interface is wrong. While the RP and FHR are setting up state we need to be sending the registers encapsulated with all the data inside otherwise we lose it. The RP then decapsulates it and forwards it to the interested parties. Currently with WRONGVIF we can only be sending empty register packets and will lose that data. This behaviour can be enabled by using MRT_PIM with val == IGMPMSG_WRVIFWHOLE. This doesn't prevent IGMPMSG_WRONGVIF from happening, it happens in addition to it, also it is controlled by the same throttling parameters as WRONGVIF (i.e. 1 packet per 3 seconds currently). Both messages are generated to keep backwards compatibily and avoid breaking someone who was enabling MRT_PIM with val == 4, since any positive val is accepted and treated the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index fd436cdd4725..6675b9f81979 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -254,6 +254,7 @@ struct mr_table { atomic_t cache_resolve_queue_len; bool mroute_do_assert; bool mroute_do_pim; + bool mroute_do_wrvifwhole; int mroute_reg_vif_num; }; -- cgit v1.2.3 From 784abe24c903b093af04cf1a043140faa556cad7 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:35 +0300 Subject: net: Add decrypted field to skb The decrypted bit is propogated to cloned/copied skbs. This will be used later by the inline crypto receive side offload of tls. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7601838c2513..3ceb8dcc54da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -736,7 +737,11 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ +#ifdef CONFIG_TLS_DEVICE + decrypted:1; +#else + __unused:1; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() -- cgit v1.2.3 From 14136564c8ee94566945e85014019cbdb1716dca Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Fri, 13 Jul 2018 14:33:36 +0300 Subject: net: Add TLS RX offload feature This patch adds a netdev feature to configure TLS RX inline crypto offload. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 623bb8ced060..2b2a6dce1630 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -79,6 +79,7 @@ enum { NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_HW_TLS_TX_BIT, /* Hardware TLS TX offload */ + NETIF_F_HW_TLS_RX_BIT, /* Hardware TLS RX offload */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ @@ -151,6 +152,7 @@ enum { #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) +#define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) -- cgit v1.2.3 From 16e4edc297ffc9b643b8dd3da6b0d579753ea2b3 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:37 +0300 Subject: net: Add TLS rx resync NDO Add new netdev tls op for resynchronizing HW tls context Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4fa7f7a3f8b3..3514d67112b3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -903,6 +903,8 @@ struct tlsdev_ops { void (*tls_dev_del)(struct net_device *netdev, struct tls_context *ctx, enum tls_offload_ctx_dir direction); + void (*tls_dev_resync_rx)(struct net_device *netdev, + struct sock *sk, u32 seq, u64 rcd_sn); }; #endif -- cgit v1.2.3 From ab412e1dd7db132c2abeb9385b4bf0dc8e6c5a65 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:46 +0300 Subject: net/mlx5: Accel, add TLS rx offload routines In Innova TLS, TLS contexts are added or deleted via a command message over the SBU connection. The HW then sends a response message over the same connection. Complete the implementation for Innova TLS (FPGA-based) hardware by adding support for rx inline crypto offload. Signed-off-by: Boris Pismenny Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc_fpga.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 64d0f40d4cc3..37e065a80a43 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -576,6 +576,7 @@ struct mlx5_ifc_fpga_ipsec_sa { enum fpga_tls_cmds { CMD_SETUP_STREAM = 0x1001, CMD_TEARDOWN_STREAM = 0x1002, + CMD_RESYNC_RX = 0x1003, }; #define MLX5_TLS_1_2 (0) -- cgit v1.2.3 From d7e5a9a50245b91f016c814b0f076f7e55cbb980 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:43 +0200 Subject: netfilter: utils: move nf_ip_checksum* from ipv4 to utils allows to make nf_ip_checksum_partial static, it no longer has an external caller. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index b31dabfdb453..95ab5cc64422 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -23,9 +23,6 @@ struct nf_queue_entry; #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); -__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry); @@ -35,14 +32,6 @@ static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, { return 0; } -static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb, - unsigned int hook, - unsigned int dataoff, - unsigned int len, - u_int8_t protocol) -{ - return 0; -} static inline int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict) { -- cgit v1.2.3 From ebee5a50d0b7cdc576aa8081f05b86971880054d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:49:59 +0200 Subject: netfilter: utils: move nf_ip6_checksum* from ipv6 to utils similar to previous change, this also allows to remove it from nf_ipv6_ops and avoid the indirection. It also removes the bogus dependency of nf_conntrack_ipv6 on ipv6 module: ipv6 checksum functions are built into kernel even if CONFIG_IPV6=m, but ipv6/netfilter.o isn't. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 288c597e75b3..c0dc4dd78887 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -30,11 +30,6 @@ struct nf_ipv6_ops { void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); - __sum16 (*checksum)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol); - __sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, unsigned int len, - u_int8_t protocol); int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); -- cgit v1.2.3 From 2b9672ddb6f347467d7b33b86c5dfc4d5c0501a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 12 Jul 2018 21:32:53 +0200 Subject: net: phy: add phy_speed_down and phy_speed_up Some network drivers include functionality to speed down the PHY when suspending and just waiting for a WoL packet because this saves energy. This functionality is quite generic, therefore let's factor it out to phylib. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6cd09098427c..075c2f770d3e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -942,6 +942,8 @@ void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); +int phy_speed_down(struct phy_device *phydev, bool sync); +int phy_speed_up(struct phy_device *phydev); int phy_stop_interrupts(struct phy_device *phydev); int phy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From d9f37d01e294e5338aa3e9d3b2eda61b59b619df Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 13 Jul 2018 14:41:36 +0800 Subject: net: convert gro_count to bitmask gro_hash size is 192 bytes, and uses 3 cache lines, if there is few flows, gro_hash may be not fully used, so it is unnecessary to iterate all gro_hash in napi_gro_flush(), to occupy unnecessary cacheline. convert gro_count to a bitmask, and rename it as gro_bitmask, each bit represents a element of gro_hash, only flush a gro_hash element if the related bit is set, to speed up napi_gro_flush(). and update gro_bitmask only if it will be changed, to reduce cache update Suggested-by: Eric Dumazet Signed-off-by: Li RongQing Cc: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3514d67112b3..c1295c7a452e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,14 @@ struct gro_list { }; /* - * Structure for NAPI scheduling similar to tasklet but with weighting + * size of gro hash buckets, must less than bit number of + * napi_struct::gro_bitmask */ #define GRO_HASH_BUCKETS 8 + +/* + * Structure for NAPI scheduling similar to tasklet but with weighting + */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means @@ -322,7 +327,7 @@ struct napi_struct { unsigned long state; int weight; - unsigned int gro_count; + unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL int poll_owner; -- cgit v1.2.3 From 2fe31e43124040a67495fa81f7ac67bc4c09ebc8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:10 +0200 Subject: hwmon: Add missing HWMON_T_LCRIT_ALARM define The enum hwmon_temp_lcrit_alarm exists, but the BIT definition is missing. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/hwmon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index e5fd2707b6df..1b74ad11a5a4 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -93,6 +93,7 @@ enum hwmon_temp_attributes { #define HWMON_T_MIN_ALARM BIT(hwmon_temp_min_alarm) #define HWMON_T_MAX_ALARM BIT(hwmon_temp_max_alarm) #define HWMON_T_CRIT_ALARM BIT(hwmon_temp_crit_alarm) +#define HWMON_T_LCRIT_ALARM BIT(hwmon_temp_lcrit_alarm) #define HWMON_T_EMERGENCY_ALARM BIT(hwmon_temp_emergency_alarm) #define HWMON_T_FAULT BIT(hwmon_temp_fault) #define HWMON_T_OFFSET BIT(hwmon_temp_offset) -- cgit v1.2.3 From aa7f29b07c8702127124d0522e3cd46850cdbc41 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:11 +0200 Subject: hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm Some sensors support reporting minimal and lower critical power, as well as alarms when these thresholds are reached. Add support for these attributes to the hwmon core. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/hwmon.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 1b74ad11a5a4..b217101ca76e 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -188,12 +188,16 @@ enum hwmon_power_attributes { hwmon_power_cap_hyst, hwmon_power_cap_max, hwmon_power_cap_min, + hwmon_power_min, hwmon_power_max, hwmon_power_crit, + hwmon_power_lcrit, hwmon_power_label, hwmon_power_alarm, hwmon_power_cap_alarm, + hwmon_power_min_alarm, hwmon_power_max_alarm, + hwmon_power_lcrit_alarm, hwmon_power_crit_alarm, }; @@ -214,12 +218,16 @@ enum hwmon_power_attributes { #define HWMON_P_CAP_HYST BIT(hwmon_power_cap_hyst) #define HWMON_P_CAP_MAX BIT(hwmon_power_cap_max) #define HWMON_P_CAP_MIN BIT(hwmon_power_cap_min) +#define HWMON_P_MIN BIT(hwmon_power_min) #define HWMON_P_MAX BIT(hwmon_power_max) +#define HWMON_P_LCRIT BIT(hwmon_power_lcrit) #define HWMON_P_CRIT BIT(hwmon_power_crit) #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) enum hwmon_energy_attributes { -- cgit v1.2.3 From dcb5d0fcaa1ddd0af9c18ef51e6b05e38a6cec71 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:12 +0200 Subject: hwmon: Add helper to tell if a char is invalid in a name HWMON device names are not allowed to contain "-* \t\n". Add a helper which will return true if passed an invalid character. It can be used to massage a string into a hwmon compatible name by replacing invalid characters with '_'. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/hwmon.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index b217101ca76e..9493d4a388db 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -398,4 +398,27 @@ devm_hwmon_device_register_with_info(struct device *dev, void hwmon_device_unregister(struct device *dev); void devm_hwmon_device_unregister(struct device *dev); +/** + * hwmon_is_bad_char - Is the char invalid in a hwmon name + * @ch: the char to be considered + * + * hwmon_is_bad_char() can be used to determine if the given character + * may not be used in a hwmon name. + * + * Returns true if the char is invalid, false otherwise. + */ +static inline bool hwmon_is_bad_char(const char ch) +{ + switch (ch) { + case '-': + case '*': + case ' ': + case '\t': + case '\n': + return true; + default: + return false; + } +} + #endif -- cgit v1.2.3 From 1323061a018a7514287894a552c4ec2a5f0cb0cd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 17 Jul 2018 21:48:13 +0200 Subject: net: phy: sfp: Add HWMON support for module sensors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SFP modules can contain a number of sensors. The EEPROM also contains recommended alarm and critical values for each sensor, and indications of if these have been exceeded. Export this information via HWMON. Currently temperature, VCC, bias current, transmit power, and possibly receiver power is supported. The sensors in the modules can either return calibrate or uncalibrated values. Uncalibrated values need to be manipulated, using coefficients provided in the SFP EEPROM. Uncalibrated receive power values require floating point maths in order to calibrate them. Performing this in the kernel is hard. So if the SFP module indicates it uses uncalibrated values, RX power is not made available. With this hwmon device, it is possible to view the sensor values using lm-sensors programs: in0: +3.29 V (crit min = +2.90 V, min = +3.00 V) (max = +3.60 V, crit max = +3.70 V) temp1: +33.0°C (low = -5.0°C, high = +80.0°C) (crit low = -10.0°C, crit = +85.0°C) power1: 1000.00 nW (max = 794.00 uW, min = 50.00 uW) ALARM (LCRIT) (lcrit = 40.00 uW, crit = 1000.00 uW) curr1: +0.00 A (crit min = +0.00 A, min = +0.00 A) ALARM (LCRIT, MIN) (max = +0.01 A, crit max = +0.01 A) The scaling sensors performs on the bias current is not particularly good. The raw values are more useful: curr1: curr1_input: 0.000 curr1_min: 0.002 curr1_max: 0.010 curr1_lcrit: 0.000 curr1_crit: 0.011 curr1_min_alarm: 1.000 curr1_max_alarm: 0.000 curr1_lcrit_alarm: 1.000 curr1_crit_alarm: 0.000 In order to keep the I2C overhead to a minimum, the constant values, such as limits and calibration coefficients are read once at module insertion time. Thus only reading *_input and *_alarm properties requires i2c read operations. Signed-off-by: Andrew Lunn Acked-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/sfp.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index ebce9e24906a..d37518e89db2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -231,6 +231,50 @@ struct sfp_eeprom_id { struct sfp_eeprom_ext ext; } __packed; +struct sfp_diag { + __be16 temp_high_alarm; + __be16 temp_low_alarm; + __be16 temp_high_warn; + __be16 temp_low_warn; + __be16 volt_high_alarm; + __be16 volt_low_alarm; + __be16 volt_high_warn; + __be16 volt_low_warn; + __be16 bias_high_alarm; + __be16 bias_low_alarm; + __be16 bias_high_warn; + __be16 bias_low_warn; + __be16 txpwr_high_alarm; + __be16 txpwr_low_alarm; + __be16 txpwr_high_warn; + __be16 txpwr_low_warn; + __be16 rxpwr_high_alarm; + __be16 rxpwr_low_alarm; + __be16 rxpwr_high_warn; + __be16 rxpwr_low_warn; + __be16 laser_temp_high_alarm; + __be16 laser_temp_low_alarm; + __be16 laser_temp_high_warn; + __be16 laser_temp_low_warn; + __be16 tec_cur_high_alarm; + __be16 tec_cur_low_alarm; + __be16 tec_cur_high_warn; + __be16 tec_cur_low_warn; + __be32 cal_rxpwr4; + __be32 cal_rxpwr3; + __be32 cal_rxpwr2; + __be32 cal_rxpwr1; + __be32 cal_rxpwr0; + __be16 cal_txi_slope; + __be16 cal_txi_offset; + __be16 cal_txpwr_slope; + __be16 cal_txpwr_offset; + __be16 cal_t_slope; + __be16 cal_t_offset; + __be16 cal_v_slope; + __be16 cal_v_offset; +} __packed; + /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0x00, @@ -384,7 +428,33 @@ enum { SFP_TEC_CUR = 0x6c, SFP_STATUS = 0x6e, - SFP_ALARM = 0x70, + SFP_ALARM0 = 0x70, + SFP_ALARM0_TEMP_HIGH = BIT(7), + SFP_ALARM0_TEMP_LOW = BIT(6), + SFP_ALARM0_VCC_HIGH = BIT(5), + SFP_ALARM0_VCC_LOW = BIT(4), + SFP_ALARM0_TX_BIAS_HIGH = BIT(3), + SFP_ALARM0_TX_BIAS_LOW = BIT(2), + SFP_ALARM0_TXPWR_HIGH = BIT(1), + SFP_ALARM0_TXPWR_LOW = BIT(0), + + SFP_ALARM1 = 0x71, + SFP_ALARM1_RXPWR_HIGH = BIT(7), + SFP_ALARM1_RXPWR_LOW = BIT(6), + + SFP_WARN0 = 0x74, + SFP_WARN0_TEMP_HIGH = BIT(7), + SFP_WARN0_TEMP_LOW = BIT(6), + SFP_WARN0_VCC_HIGH = BIT(5), + SFP_WARN0_VCC_LOW = BIT(4), + SFP_WARN0_TX_BIAS_HIGH = BIT(3), + SFP_WARN0_TX_BIAS_LOW = BIT(2), + SFP_WARN0_TXPWR_HIGH = BIT(1), + SFP_WARN0_TXPWR_LOW = BIT(0), + + SFP_WARN1 = 0x75, + SFP_WARN1_RXPWR_HIGH = BIT(7), + SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, SFP_VSL = 0x78, -- cgit v1.2.3 From be2ab5b4d5c0bf041a34ec2e1397d50afbfb095e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:12 +0200 Subject: netfilter: nf_tables: take module reference when starting a batch Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 3ecc3050be0e..4a520d3304a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,6 +29,7 @@ struct nfnetlink_subsystem { __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ + struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); void (*cleanup)(struct net *net); -- cgit v1.2.3 From d29ab6e1fa21ebc2a8a771015dd9e0e5d4e28dc1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 13 Jul 2018 12:41:10 -0700 Subject: bpf: bpf_prog_array_alloc() should return a generic non-rcu pointer Currently the return type of the bpf_prog_array_alloc() is struct bpf_prog_array __rcu *, which is not quite correct. Obviously, the returned pointer is a generic pointer, which is valid for an indefinite amount of time and it's not shared with anyone else, so there is no sense in marking it as __rcu. This change eliminate the following sparse warnings: kernel/bpf/core.c:1544:31: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1544:31: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1544:31: got void * kernel/bpf/core.c:1548:17: warning: incorrect type in return expression (different address spaces) kernel/bpf/core.c:1548:17: expected struct bpf_prog_array [noderef] * kernel/bpf/core.c:1548:17: got struct bpf_prog_array * kernel/bpf/core.c:1681:15: warning: incorrect type in assignment (different address spaces) kernel/bpf/core.c:1681:15: expected struct bpf_prog_array *array kernel/bpf/core.c:1681:15: got struct bpf_prog_array [noderef] * Fixes: 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8827e797ff97..943fb08d8287 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -352,7 +352,7 @@ struct bpf_prog_array { struct bpf_prog *progs[0]; }; -struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, -- cgit v1.2.3 From 09728266b6f99ab57cd4f84f3eead65b7b65dbf7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:23 -0700 Subject: bpf: offload: rename bpf_offload_dev_match() to bpf_offload_prog_map_match() A set of new API functions exported for the drivers will soon use 'bpf_offload_dev_' as a prefix. Rename the bpf_offload_dev_match() which is internal to the core (used by the verifier) to avoid any confusion. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 943fb08d8287..9b010d9129f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -648,7 +648,7 @@ int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key); -bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); +bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -- cgit v1.2.3 From 9fd7c5559165f4c679b40c5e6ad442955832dfad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:24 -0700 Subject: bpf: offload: aggregate offloads per-device Currently we have two lists of offloaded objects - programs and maps. Netdevice unregister notifier scans those lists to orphan objects associated with device being unregistered. This puts unnecessary (even if negligible) burden on all netdev unregister calls in BPF- -enabled kernel. The lists of objects may potentially get long making the linear scan even more problematic. There haven't been complaints about this mechanisms so far, but it is suboptimal. Instead of relying on notifiers, make the few BPF-capable drivers register explicitly for BPF offloads. The programs and maps will now be collected per-device not on a global list, and only scanned for removal when driver unregisters from BPF offloads. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9b010d9129f3..aa2e834a122b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -650,6 +650,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); +int bpf_offload_dev_netdev_register(struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct net_device *netdev); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -- cgit v1.2.3 From 602144c224604f1cbff02ee2d1cf46825269ecbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:25 -0700 Subject: bpf: offload: keep the offload state per-ASIC Create a higher-level entity to represent a device/ASIC to allow programs and maps to be shared between device ports. The extra work is required to make sure we don't destroy BPF objects as soon as the netdev for which they were loaded gets destroyed, as other ports may still be using them. When netdev goes away all of its BPF objects will be moved to other netdevs of the device, and only destroyed when last netdev is unregistered. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa2e834a122b..913465e45062 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,6 +85,7 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offload_dev; struct bpf_offloaded_map; struct bpf_map_dev_ops { @@ -650,8 +651,12 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); -int bpf_offload_dev_netdev_register(struct net_device *netdev); -void bpf_offload_dev_netdev_unregister(struct net_device *netdev); +struct bpf_offload_dev *bpf_offload_dev_create(void); +void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, + struct net_device *netdev); +void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, + struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -- cgit v1.2.3 From fd4f227dea0f24d89f52f7c4eb3207f84ddcbcbd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jul 2018 10:53:26 -0700 Subject: bpf: offload: allow program and map sharing per-ASIC Allow programs and maps to be re-used across different netdevs, as long as they belong to the same struct bpf_offload_dev. Update the bpf_offload_prog_map_match() helper for the verifier and export a new helper for the drivers to use when checking programs at attachment time. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 913465e45062..5b5ad95cf339 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -657,6 +657,7 @@ int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, struct net_device *netdev); +bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -- cgit v1.2.3 From a48d189ef53146a8df132a327a637c4182f50a16 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 17 Jul 2018 11:52:57 +0200 Subject: net: Move skb decrypted field, avoid explicity copy Commit 784abe24c903 ("net: Add decrypted field to skb") introduced a 'decrypted' field that is explicitly copied on skb copy and clone. Move it between headers_start[0] and headers_end[0], so that we don't need to copy it explicitly as it's copied by the memcpy() in __copy_skb_header(). While at it, drop the assignment in __skb_clone(), it was already redundant. This doesn't change the size of sk_buff or cacheline boundaries. The 15-bits hole before tc_index becomes a 14-bits hole, and will be again a 15-bits hole when this change is merged with commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()"). v2: as reported by kbuild test robot (oops, I forgot to build with CONFIG_TLS_DEVICE it seems), we can't use CHECK_SKB_FIELD() on a bit-field member. Just drop the check for the moment being, perhaps we could think of some magic to also check bit-field members one day. Fixes: 784abe24c903 ("net: Add decrypted field to skb") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ceb8dcc54da..14bc9ebe30f2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,7 +630,6 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue - * @decrypted: Decrypted SKB * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -641,6 +640,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @dst_pending_confirm: need to confirm neighbour + * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark @@ -737,11 +737,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, -#ifdef CONFIG_TLS_DEVICE - decrypted:1; -#else __unused:1; -#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -797,6 +793,9 @@ struct sk_buff { __u8 tc_redirected:1; __u8 tc_from_ingress:1; #endif +#ifdef CONFIG_TLS_DEVICE + __u8 decrypted:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ -- cgit v1.2.3 From eff8ea8f24eac76bc21c25e4ca4ac4ee2dade846 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 16 Jul 2018 18:35:30 -0700 Subject: net/mlx5: FW tracer, add hardware structures This change adds the infrastructure to mlx5 core fw tracer. It introduces the following 4 new registers: MLX5_REG_MTRC_CAP - Used to read tracer capabilities MLX5_REG_MTRC_CONF - Used to set tracer configurations MLX5_REG_MTRC_STDB - Used to query tracer strings database MLX5_REG_MTRC_CTRL - Used to control the tracer The capability of the tracing can be checked using mcam access register, therefore, the mcam access register interface will expose the tracer register. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 +++ include/linux/mlx5/mlx5_ifc.h | 61 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1cb1c0317b77..4a4125b4279d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -138,6 +138,10 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MCIA = 0x9014, MLX5_REG_MLCR = 0x902b, + MLX5_REG_MTRC_CAP = 0x9040, + MLX5_REG_MTRC_CONF = 0x9041, + MLX5_REG_MTRC_STDB = 0x9042, + MLX5_REG_MTRC_CTRL = 0x9043, MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1853e7fd6924..bd7b71f54d59 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8112,7 +8112,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mcqi[0x1]; u8 reserved_at_1f[0x1]; - u8 regs_95_to_64[0x20]; + u8 regs_95_to_68[0x1c]; + u8 tracer_registers[0x4]; + u8 regs_63_to_32[0x20]; u8 regs_31_to_0[0x20]; }; @@ -9187,4 +9189,61 @@ struct mlx5_ifc_create_uctx_in_bits { struct mlx5_ifc_uctx_bits uctx; }; +struct mlx5_ifc_mtrc_string_db_param_bits { + u8 string_db_base_address[0x20]; + + u8 reserved_at_20[0x8]; + u8 string_db_size[0x18]; +}; + +struct mlx5_ifc_mtrc_cap_bits { + u8 trace_owner[0x1]; + u8 trace_to_memory[0x1]; + u8 reserved_at_2[0x4]; + u8 trc_ver[0x2]; + u8 reserved_at_8[0x14]; + u8 num_string_db[0x4]; + + u8 first_string_trace[0x8]; + u8 num_string_trace[0x8]; + u8 reserved_at_30[0x28]; + + u8 log_max_trace_buffer_size[0x8]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_mtrc_string_db_param_bits string_db_param[8]; + + u8 reserved_at_280[0x180]; +}; + +struct mlx5_ifc_mtrc_conf_bits { + u8 reserved_at_0[0x1c]; + u8 trace_mode[0x4]; + u8 reserved_at_20[0x18]; + u8 log_trace_buffer_size[0x8]; + u8 trace_mkey[0x20]; + u8 reserved_at_60[0x3a0]; +}; + +struct mlx5_ifc_mtrc_stdb_bits { + u8 string_db_index[0x4]; + u8 reserved_at_4[0x4]; + u8 read_size[0x18]; + u8 start_offset[0x20]; + u8 string_db_data[0]; +}; + +struct mlx5_ifc_mtrc_ctrl_bits { + u8 trace_status[0x2]; + u8 reserved_at_2[0x2]; + u8 arm_event[0x1]; + u8 reserved_at_5[0xb]; + u8 modify_field_select[0x10]; + u8 reserved_at_20[0x2b]; + u8 current_timestamp52_32[0x15]; + u8 current_timestamp31_0[0x20]; + u8 reserved_at_80[0x180]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 5e022dd353b74132bf216a77b169c43e39f5be9e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 16 Jul 2018 18:35:31 -0700 Subject: net/mlx5: Expose MPEGC (Management PCIe General Configuration) structures This patch exposes PRM layout for handling MPEGC (Management PCIe General Configuration). This will be used in the downstream patch for configuring MPEGC via the driver. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 4a4125b4279d..957199c20a0f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -145,6 +145,7 @@ enum { MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, + MLX5_REG_MPEGC = 0x9056, MLX5_REG_MCQI = 0x9061, MLX5_REG_MCC = 0x9062, MLX5_REG_MCDA = 0x9063, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bd7b71f54d59..2de5feaeb74a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8049,6 +8049,19 @@ struct mlx5_ifc_peir_reg_bits { u8 error_type[0x8]; }; +struct mlx5_ifc_mpegc_reg_bits { + u8 reserved_at_0[0x30]; + u8 field_select[0x10]; + + u8 tx_overflow_sense[0x1]; + u8 mark_cqe[0x1]; + u8 mark_cnp[0x1]; + u8 reserved_at_43[0x1b]; + u8 tx_lossy_overflow_oper[0x2]; + + u8 reserved_at_60[0x100]; +}; + struct mlx5_ifc_pcam_enhanced_features_bits { u8 reserved_at_0[0x6d]; u8 rx_icrc_encapsulated_counter[0x1]; @@ -8097,7 +8110,11 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7b]; + u8 reserved_at_0[0x74]; + u8 mark_tx_action_cnp[0x1]; + u8 mark_tx_action_cqe[0x1]; + u8 dynamic_tx_overflow[0x1]; + u8 reserved_at_77[0x4]; u8 pcie_outbound_stalled[0x1]; u8 tx_overflow_buffer_pkt[0x1]; u8 mtpps_enh_out_per_adj[0x1]; @@ -8112,7 +8129,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mcqi[0x1]; u8 reserved_at_1f[0x1]; - u8 regs_95_to_68[0x1c]; + u8 regs_95_to_87[0x9]; + u8 mpegc[0x1]; + u8 regs_85_to_68[0x12]; u8 tracer_registers[0x4]; u8 regs_63_to_32[0x20]; -- cgit v1.2.3 From 8da6fe2a18505b9bd977e573d62d33f836c6903c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 16 Jul 2018 18:35:32 -0700 Subject: net/mlx5: Add core support for double vlan push/pop steering action As newer firmware supports double push/pop in a single FTE, we add core bits and extend vlan action logic for it. Signed-off-by: Jianbo Liu Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 4 +++- include/linux/mlx5/mlx5_ifc.h | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 757b4a30281e..c40f2fc68655 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -152,6 +152,8 @@ struct mlx5_fs_vlan { u8 prio; }; +#define MLX5_FS_VLAN_DEPTH 2 + struct mlx5_flow_act { u32 action; bool has_flow_tag; @@ -159,7 +161,7 @@ struct mlx5_flow_act { u32 encap_id; u32 modify_id; uintptr_t esp_id; - struct mlx5_fs_vlan vlan; + struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2de5feaeb74a..ae12120ef021 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -337,7 +337,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_9[0x1]; u8 pop_vlan[0x1]; u8 push_vlan[0x1]; - u8 reserved_at_c[0x14]; + u8 reserved_at_c[0x1]; + u8 pop_vlan_2[0x1]; + u8 push_vlan_2[0x1]; + u8 reserved_at_f[0x11]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -2386,6 +2389,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, MLX5_FLOW_CONTEXT_ACTION_VLAN_POP = 0x80, MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 = 0x400, + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, }; struct mlx5_ifc_vlan_bits { @@ -2416,7 +2421,9 @@ struct mlx5_ifc_flow_context_bits { u8 modify_header_id[0x20]; - u8 reserved_at_100[0x100]; + struct mlx5_ifc_vlan_bits push_vlan_2; + + u8 reserved_at_120[0xe0]; struct mlx5_ifc_fte_match_param_bits match_value; -- cgit v1.2.3 From e2abdcf1d2dd59d7164cf975c50e1587b96b9d04 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 16 Jul 2018 18:35:36 -0700 Subject: net/mlx5: Better return types for CQE API Reduce sizes of return types. Use bool for binary indication. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f8671c0a43aa..0566c6a94805 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -750,7 +750,7 @@ enum { #define MLX5_MINI_CQE_ARRAY_SIZE 8 -static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) +static inline u8 mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) { return (cqe->op_own >> 2) & 0x3; } @@ -770,14 +770,14 @@ static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) return (cqe->l4_l3_hdr_type >> 2) & 0x3; } -static inline u8 cqe_is_tunneled(struct mlx5_cqe64 *cqe) +static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { return cqe->outer_l3_tunneled & 0x1; } -static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) { - return !!(cqe->l4_l3_hdr_type & 0x1); + return cqe->l4_l3_hdr_type & 0x1; } static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) -- cgit v1.2.3 From b51dab46c6adfbb7e80cd0f59ae17b8a30d94b1a Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 18 Jul 2018 06:27:22 -0700 Subject: qed: Add qed APIs for PHY module query. This patch adds qed APIs for reading the PHY module. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b4040023cbfb..8cd34645e892 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -759,6 +759,9 @@ struct qed_generic_tlvs { u8 mac[QED_TLV_MAC_COUNT][ETH_ALEN]; }; +#define QED_I2C_DEV_ADDR_A0 0xA0 +#define QED_I2C_DEV_ADDR_A2 0xA2 + #define QED_NVM_SIGNATURE 0x12435687 enum qed_nvm_flash_cmd { @@ -1026,6 +1029,18 @@ struct qed_common_ops { * @param enabled - true iff WoL should be enabled. */ int (*update_wol) (struct qed_dev *cdev, bool enabled); + +/** + * @brief read_module_eeprom + * + * @param cdev + * @param buf - buffer + * @param dev_addr - PHY device memory region + * @param offset - offset into eeprom contents to be read + * @param len - buffer length, i.e., max bytes to be read + */ + int (*read_module_eeprom)(struct qed_dev *cdev, + char *buf, u8 dev_addr, u32 offset, u32 len); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 488dee96bb62f0b3d9e678cf42574034d5b033a5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:47 +0000 Subject: kernfs: allow creating kernfs objects with arbitrary uid/gid This change allows creating kernfs files and directories with arbitrary uid/gid instead of always using GLOBAL_ROOT_UID/GID by extending kernfs_create_dir_ns() and kernfs_create_file_ns() with uid/gid arguments. The "simple" kernfs_create_file() and kernfs_create_dir() are left alone and always create objects belonging to the global root. When creating symlinks ownership (uid/gid) is taken from the target kernfs object. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- include/linux/kernfs.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ab25c8b6d9e3..814643f7ee52 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,6 +15,7 @@ #include #include #include +#include #include struct file; @@ -325,12 +326,14 @@ void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, + kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, - const char *name, - umode_t mode, loff_t size, + const char *name, umode_t mode, + kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); @@ -415,12 +418,14 @@ static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, - umode_t mode, void *priv, const void *ns) + umode_t mode, kuid_t uid, kgid_t gid, + void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } @@ -498,12 +503,15 @@ static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { - return kernfs_create_dir_ns(parent, name, mode, priv, NULL); + return kernfs_create_dir_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + priv, NULL); } static inline struct kernfs_node * kernfs_create_file_ns(struct kernfs_node *parent, const char *name, - umode_t mode, loff_t size, const struct kernfs_ops *ops, + umode_t mode, kuid_t uid, kgid_t gid, + loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns) { struct lock_class_key *key = NULL; @@ -511,15 +519,17 @@ kernfs_create_file_ns(struct kernfs_node *parent, const char *name, #ifdef CONFIG_DEBUG_LOCK_ALLOC key = (struct lock_class_key *)&ops->lockdep_key; #endif - return __kernfs_create_file(parent, name, mode, size, ops, priv, ns, - key); + return __kernfs_create_file(parent, name, mode, uid, gid, + size, ops, priv, ns, key); } static inline struct kernfs_node * kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) { - return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL); + return kernfs_create_file_ns(parent, name, mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + size, ops, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, -- cgit v1.2.3 From 5f81880d5204ee2388fd9a75bb850ccd526885b7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:48 +0000 Subject: sysfs, kobject: allow creating kobject belonging to arbitrary users Normally kobjects and their sysfs representation belong to global root, however it is not necessarily the case for objects in separate namespaces. For example, objects in separate network namespace logically belong to the container's root and not global root. This change lays groundwork for allowing network namespace objects ownership to be transferred to container's root user by defining get_ownership() callback in ktype structure and using it in sysfs code to retrieve desired uid/gid when creating sysfs objects for given kobject. Co-Developed-by: Tyler Hicks Signed-off-by: Dmitry Torokhov Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- include/linux/kobject.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 7f6f93c3df9c..b49ff230beba 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -26,6 +26,7 @@ #include #include #include +#include #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 32 /* number of env pointers */ @@ -114,6 +115,8 @@ extern struct kobject * __must_check kobject_get_unless_zero( extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); +extern void kobject_get_ownership(struct kobject *kobj, + kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); struct kobj_type { @@ -122,6 +125,7 @@ struct kobj_type { struct attribute **default_attrs; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); + void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { -- cgit v1.2.3 From 9944e894c1266dc8515c82d1ff752d681215526b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 20 Jul 2018 21:56:50 +0000 Subject: driver core: set up ownership of class devices in sysfs Plumb in get_ownership() callback for devices belonging to a class so that they can be created with uid/gid different from global root. This will allow network devices in a container to belong to container's root and not global root. Signed-off-by: Dmitry Torokhov Reviewed-by: Tyler Hicks Signed-off-by: David S. Miller --- include/linux/device.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 055a69dbcd18..fe6ccb6dc119 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -384,6 +384,9 @@ int subsys_virtual_register(struct bus_type *subsys, * @shutdown_pre: Called at shut-down time before driver shutdown. * @ns_type: Callbacks so sysfs can detemine namespaces. * @namespace: Namespace of the device belongs to this class. + * @get_ownership: Allows class to specify uid/gid of the sysfs directories + * for the devices belonging to the class. Usually tied to + * device's namespace. * @pm: The default device power management operations of this class. * @p: The private data of the driver core, no one other than the * driver core can touch this. @@ -413,6 +416,8 @@ struct class { const struct kobj_ns_type_operations *ns_type; const void *(*namespace)(struct device *dev); + void (*get_ownership)(struct device *dev, kuid_t *uid, kgid_t *gid); + const struct dev_pm_ops *pm; struct subsys_private *p; -- cgit v1.2.3 From f53aaa31cce7b543e407da7e97690a700206f7b9 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 16 Jul 2018 15:22:01 -0700 Subject: net/mlx5: FW tracer, implement tracer logic Implement FW tracer logic and registers access, initialization and cleanup flows. Initializing the tracer will be part of load one flow, as multiple PFs will try to acquire ownership but only one will succeed and will be the tracer owner. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 957199c20a0f..86cb0ebf92fa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -816,6 +816,8 @@ struct mlx5_clock { struct mlx5_pps pps_info; }; +struct mlx5_fw_tracer; + struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -860,6 +862,7 @@ struct mlx5_core_dev { struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; struct page *clock_info_page; + struct mlx5_fw_tracer *tracer; }; struct mlx5_db { -- cgit v1.2.3 From c71ad41ccb0c29fce95149b74786574b354c9dda Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 7 Feb 2018 11:08:56 +0200 Subject: net/mlx5: FW tracer, events handling The tracer has one event, event 0x26, with two subtypes: - Subtype 0: Ownership change - Subtype 1: Traces available An ownership change occurs in the following cases: 1- Owner releases his ownership, in this case, an event will be sent to inform others to reattempt acquire ownership. 2- Ownership was taken by a higher priority tool, in this case the owner should understand that it lost ownership, and go through tear down flow. The second subtype indicates that there are traces in the trace buffer, in this case, the driver polls the tracer buffer for new traces, parse them and prepares the messages for printing. The HW starts tracing from the first address in the tracer buffer. Driver receives an event notifying that new trace block exists. HW posts a timestamp event at the last 8B of every 256B block. Comparing the timestamp to the last handled timestamp would indicate that this is a new trace block. Once the new timestamp is detected, the entire block is considered valid. Block validation and parsing, should be done after copying the current block to a different location, in order to avoid block overwritten during processing. Signed-off-by: Feras Daoud Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0566c6a94805..d489494b0a84 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -332,6 +332,13 @@ enum mlx5_event { MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, MLX5_EVENT_TYPE_FPGA_QP_ERROR = 0x21, + + MLX5_EVENT_TYPE_DEVICE_TRACER = 0x26, +}; + +enum { + MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE = 0x0, + MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE = 0x1, }; enum { -- cgit v1.2.3 From 3730cf4dd70b6a36e48d58a862120311411b77f5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Jul 2018 12:47:56 +0200 Subject: netlink: do not store start function in netlink_cb ->start() is called once when dump is being initialized, there is no need to store it in netlink_cb. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f3075d6c7e82..71f121b66ca8 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -170,7 +170,6 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; - int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); -- cgit v1.2.3 From 3c507b8af638c67d4a80d70091d2057ecb01e8a6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 23 Jul 2018 21:40:07 +0200 Subject: net: phy: add helper phy_polling_mode Add a helper for checking whether polling is used to detect PHY status changes. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 075c2f770d3e..cd6f637cbbfb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -824,6 +824,16 @@ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT; } +/** + * phy_polling_mode - Convenience function for testing whether polling is + * used to detect PHY status changes + * @phydev: the phy_device struct + */ +static inline bool phy_polling_mode(struct phy_device *phydev) +{ + return phydev->irq == PHY_POLL; +} + /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct -- cgit v1.2.3 From 038709071328ff68a936c6b8c33a24a805eea3c5 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Wed, 13 Jun 2018 16:37:17 +0200 Subject: can: dev: enable multi-queue for SocketCAN devices The existing SocketCAN implementation provides alloc_candev() to allocate a CAN device using a single Tx and Rx queue. This can lead to priority inversion in case the single Tx queue is already full with low priority messages and a high priority message needs to be sent while the bus is fully loaded with medium priority messages. This problem can be solved by using the existing multi-queue support of the network subsytem. The commit makes it possible to use multi-queue in the CAN subsystem in the same way it is used in the Ethernet subsystem by adding an alloc_candev_mqs() call and accompanying macros. With this support a CAN device can use multi-queue qdisc (e.g. mqprio) to avoid the aforementioned priority inversion. The exisiting functionality of alloc_candev() is the same as before. CAN devices need to have prioritized multiple hardware queues or are able to abort waiting for arbitration to make sensible use of multi-queues. Signed-off-by: Zhu Yi Signed-off-by: Mark Jonas Reviewed-by: Heiko Schocher Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 055aaf5ed9af..a83e1f632eb7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -143,7 +143,12 @@ u8 can_dlc2len(u8 can_dlc); /* map the sanitized data length to an appropriate data length code */ u8 can_len2dlc(u8 len); -struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max); +struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, + unsigned int txqs, unsigned int rxqs); +#define alloc_candev(sizeof_priv, echo_skb_max) \ + alloc_candev_mqs(sizeof_priv, echo_skb_max, 1, 1) +#define alloc_candev_mq(sizeof_priv, echo_skb_max, count) \ + alloc_candev_mqs(sizeof_priv, echo_skb_max, count, count) void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ -- cgit v1.2.3 From 22a65aa8b1a84ca429c0fc8415dee5681ab36eb3 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 25 Dec 2017 18:40:52 +0200 Subject: net/mlx5e: Vxlan, check maximum number of UDP ports The NIC has a limited number of offloaded VXLAN UDP ports (usually 4). Instead of letting the firmware fail when trying to add more ports than it can handle, let the driver check it on its own. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 22f54bedfaae..60c2308fe062 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -668,7 +668,9 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 swp[0x1]; u8 swp_csum[0x1]; u8 swp_lso[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xd]; + u8 max_vxlan_udp_ports[0x8]; + u8 reserved_at_38[0x6]; u8 max_geneve_opt_len[0x1]; u8 tunnel_stateless_geneve_rx[0x1]; -- cgit v1.2.3 From 358aa5ce288aa1085f0f3ef9f315119563fa6541 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 9 May 2018 13:28:00 -0700 Subject: net/mlx5e: Vxlan, move vxlan logic to core driver Move vxlan logic and objects to mlx5 core dirver. Since it going to be used from different mlx5 interfaces. e.g. mlx5e PF NIC netdev and mlx5e E-Switch representors. Signed-off-by: Saeed Mahameed Reviewed-by: Or Gerlitz --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fd0aaa5568fe..54f385cc8811 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -818,6 +818,7 @@ struct mlx5_clock { }; struct mlx5_fw_tracer; +struct mlx5_vxlan; struct mlx5_core_dev { struct pci_dev *pdev; @@ -850,6 +851,7 @@ struct mlx5_core_dev { atomic_t num_qps; u32 issi; struct mlx5e_resources mlx5e_res; + struct mlx5_vxlan *vxlan; struct { struct mlx5_rsvd_gids reserved_gids; u32 roce_en; -- cgit v1.2.3 From 5cbf777cfdf6e5a7b7149006e4881a255da78fdd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 27 Jul 2018 16:37:28 +0800 Subject: route: add support for directed broadcast forwarding This patch implements the feature described in rfc1812#section-5.3.5.2 and rfc2644. It allows the router to forward directed broadcast when sysctl bc_forwarding is enabled. Note that this feature could be done by iptables -j TEE, but it would cause some problems: - target TEE's gateway param has to be set with a specific address, and it's not flexible especially when the route wants forward all directed broadcasts. - this duplicates the directed broadcasts so this may cause side effects to applications. Besides, to keep consistent with other os router like BSD, it's also necessary to implement it in the route rx path. Note that route cache needs to be flushed when bc_forwarding is changed. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 27650f1bff3d..c759d1cbcedd 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) +#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ -- cgit v1.2.3 From 7a4c53bee3324ac00bf964aa2f82d15d279e86e4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 27 Jul 2018 13:43:23 -0700 Subject: net: report invalid mtu value via netlink extack If an invalid MTU value is set through rtnetlink return extra error information instead of putting message in kernel log. For other cases where there is no visible API, keep the error report in the log. Example: # ip li set dev enp12s0 mtu 10000 Error: mtu greater than device maximum. # ifconfig enp12s0 mtu 10000 SIOCSIFMTU: Invalid argument # dmesg | tail -1 [ 2047.795467] enp12s0: mtu greater than device maximum Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1295c7a452e..9c917467a2c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3546,6 +3546,8 @@ int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); int dev_change_tx_queue_len(struct net_device *, unsigned long); void dev_set_group(struct net_device *, int); -- cgit v1.2.3 From 51c23b47e6b8590ea7a6a6776ffb21810ece73bf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 13 Jul 2018 14:54:45 +0200 Subject: netfilter: nf_osf: add nf_osf_find() This new function returns the OS genre as a string. Plan is to use to from the new nft_osf extension. Note that this doesn't yet support ttl options, but it could be easily extended to do so. Tested-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index 0e114c492fb8..aee460fcbd31 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -1,3 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + #include /* Initial window size option state machine: multiple of mss, mtu or @@ -31,3 +35,8 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ -- cgit v1.2.3 From c29c2ebd2ae0066c026045a21aa33ccbfcd8bb3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:51 -0700 Subject: net: update real_num_rx_queues even when !CONFIG_SYSFS We used to depend on real_num_rx_queues as a upper bound for sanity checks. For AF_XDP socket validation it's useful if the check behaves the same regardless of CONFIG_SYSFS setting. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c917467a2c7..3bf7e93c9e96 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3431,8 +3431,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); #else static inline int netif_set_real_num_rx_queues(struct net_device *dev, - unsigned int rxq) + unsigned int rxqs) { + dev->real_num_rx_queues = rxqs; return 0; } #endif -- cgit v1.2.3 From 84c6b86875e01a08a0daa6fdd4a01b36bf0bf0b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 30 Jul 2018 20:43:53 -0700 Subject: xsk: don't allow umem replace at stack level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently drivers have to check if they already have a umem installed for a given queue and return an error if so. Make better use of XDP_QUERY_XSK_UMEM and move this functionality to the core. We need to keep rtnl across the calls now. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Björn Töpel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3bf7e93c9e96..282e2e95ad5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,10 +872,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; - u16 queue_id; + struct xdp_umem *umem; /* out for query*/ + u16 queue_id; /* in for query */ } xsk; }; }; @@ -3568,6 +3568,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); +int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); -- cgit v1.2.3 From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:45:07 +0200 Subject: net: remove bogus RCU annotations on socket.wq We never use RCU protection for it, just a lot of cargo-cult rcu_deference_protects calls. Note that we do keep the kfree_rcu call for it, as the references through struct sock are RCU protected and thus might require a grace period before freeing. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 6554d3ba4396..e0930678c8bf 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -114,7 +114,7 @@ struct socket { unsigned long flags; - struct socket_wq __rcu *wq; + struct socket_wq *wq; struct file *file; struct sock *sk; -- cgit v1.2.3 From ba113c3aa79a7f941ac162d05a3620bdc985c58d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:21 -0700 Subject: tcp: add data bytes sent stats Introduce a new TCP stat to record the number of bytes sent (RFC4898 tcpEStatsPerfHCDataOctetsOut) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 58a8d7d71354..d0798dcd2cab 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -181,6 +181,9 @@ struct tcp_sock { u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. -- cgit v1.2.3 From fb31c9b9f6c85b1bad569ecedbde78d9e37cd87b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:22 -0700 Subject: tcp: add data bytes retransmitted stats Introduce a new TCP stat to record the number of bytes retransmitted (RFC4898 tcpEStatsPerfOctetsRetrans) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d0798dcd2cab..fb67f9a51b95 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -333,6 +333,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans + * Total data bytes retransmitted + */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ -- cgit v1.2.3 From 7e10b6554ff2ce7f86d5d3eec3af5db8db482caa Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:23 -0700 Subject: tcp: add dsack blocks received stats Introduce a new TCP stat to record the number of DSACK blocks received (RFC4989 tcpEStatsStackDSACKDups) and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fb67f9a51b95..da6281c549a5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -188,6 +188,9 @@ struct tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ -- cgit v1.2.3 From 7ec65372ca534217b53fd208500cf7aac223a383 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 31 Jul 2018 17:46:24 -0700 Subject: tcp: add stat of data packet reordering events Introduce a new TCP stats to record the number of reordering events seen and expose it in both tcp_info (TCP_INFO) and opt_stats (SOF_TIMESTAMPING_OPT_STATS). Application can use this stats to track the frequency of the reordering events in addition to the existing reordering stats which tracks the magnitude of the latest reordering event. Note: this new stats tracks reordering events triggered by ACKs, which could often be fewer than the actual number of packets being delivered out-of-order. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index da6281c549a5..263e37271afd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,8 +220,7 @@ struct tcp_sock { #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ - advanced:1, /* mstamp advanced since last lost marking */ - reord:1; /* reordering detected */ + advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; @@ -267,6 +266,7 @@ struct tcp_sock { u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric. */ + u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* -- cgit v1.2.3 From 0a4c58f5702858822621fa1177c7d3475f181ccb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:17 -0700 Subject: bpf: add ability to charge bpf maps memory dynamically This commits extends existing bpf maps memory charging API to support dynamic charging/uncharging. This is required to account memory used by maps, if all entries are created dynamically after the map initialization. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b5ad95cf339..5a4a256473c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -435,6 +435,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); -- cgit v1.2.3 From de9cbbaadba5adf88a19e46df61f7054000838f6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:18 -0700 Subject: bpf: introduce cgroup storage maps This commit introduces BPF_MAP_TYPE_CGROUP_STORAGE maps: a special type of maps which are implementing the cgroup storage. >From the userspace point of view it's almost a generic hash map with the (cgroup inode id, attachment type) pair used as a key. The only difference is that some operations are restricted: 1) a user can't create new entries, 2) a user can't remove existing entries. The lookup from userspace is o(log(n)). Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 1 + include/linux/bpf_types.h | 3 +++ 3 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d50c2f0a655a..7d00d58869ed 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,19 +4,39 @@ #include #include +#include #include struct sock; struct sockaddr; struct cgroup; struct sk_buff; +struct bpf_map; +struct bpf_prog; struct bpf_sock_ops_kern; +struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_cgroup_storage_map; + +struct bpf_storage_buffer { + struct rcu_head rcu; + char data[0]; +}; + +struct bpf_cgroup_storage { + struct bpf_storage_buffer *buf; + struct bpf_cgroup_storage_map *map; + struct bpf_cgroup_storage_key key; + struct list_head list; + struct rb_node node; + struct rcu_head rcu; +}; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; @@ -77,6 +97,15 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); +void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, + struct cgroup *cgroup, + enum bpf_attach_type type); +void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -221,6 +250,15 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, + struct bpf_map *map) { return 0; } +static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, + struct bpf_map *map) {} +static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( + struct bpf_prog *prog) { return 0; } +static inline void bpf_cgroup_storage_free( + struct bpf_cgroup_storage *storage) {} + #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a4a256473c3..9d1e4727495e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -282,6 +282,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + struct bpf_map *cgroup_storage; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY void *security; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c5700c2d5549..add08be53b6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif +#ifdef CONFIG_CGROUP_BPF +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) -- cgit v1.2.3 From aa0ad5b0391e268bdecf6cda31268388844f8afd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:19 -0700 Subject: bpf: pass a pointer to a cgroup storage using pcpu variable This commit introduces the bpf_cgroup_storage_set() helper, which will be used to pass a pointer to a cgroup storage to the bpf helper. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7d00d58869ed..9a144ddbbc8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -21,6 +22,8 @@ struct bpf_cgroup_storage; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +DECLARE_PER_CPU(void*, bpf_cgroup_storage); + struct bpf_cgroup_storage_map; struct bpf_storage_buffer { @@ -97,6 +100,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) +{ + struct bpf_storage_buffer *buf; + + if (!storage) + return; + + buf = READ_ONCE(storage->buf); + this_cpu_write(bpf_cgroup_storage, &buf->data[0]); +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, @@ -250,6 +264,7 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, -- cgit v1.2.3 From d7bf2c10af053191e931b58704cd862fccb7f9de Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:20 -0700 Subject: bpf: allocate cgroup storage entries on attaching bpf programs If a bpf program is using cgroup local storage, allocate a bpf_cgroup_storage structure automatically on attaching the program to a cgroup and save the pointer into the corresponding bpf_prog_list entry. Analogically, release the cgroup local storage on detaching of the bpf program. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9a144ddbbc8f..f91b0f8ff3a9 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -43,6 +43,7 @@ struct bpf_cgroup_storage { struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; -- cgit v1.2.3 From 394e40a29788820c9c0526b1c3497c9e0ec2a126 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:21 -0700 Subject: bpf: extend bpf_prog_array to store pointers to the cgroup storage This patch converts bpf_prog_array from an array of prog pointers to the array of struct bpf_prog_array_item elements. This allows to save a cgroup storage pointer for each bpf program efficiently attached to a cgroup. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d1e4727495e..16be67888c30 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -349,9 +349,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, * The 'struct bpf_prog_array *' should only be replaced with xchg() * since other cpus are walking the array of pointers in parallel. */ +struct bpf_prog_array_item { + struct bpf_prog *prog; + struct bpf_cgroup_storage *cgroup_storage; +}; + struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog *progs[0]; + struct bpf_prog_array_item items[0]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); @@ -372,7 +377,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ - struct bpf_prog **_prog, *__prog; \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ preempt_disable(); \ @@ -380,10 +386,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ goto _out; \ - _prog = _array->progs; \ - while ((__prog = READ_ONCE(*_prog))) { \ - _ret &= func(__prog, ctx); \ - _prog++; \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + _ret &= func(_prog, ctx); \ + _item++; \ } \ _out: \ rcu_read_unlock(); \ -- cgit v1.2.3 From 3e6a4b3e0289dc9540a2c1d8a20657f4707fbabb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:22 -0700 Subject: bpf/verifier: introduce BPF_PTR_TO_MAP_VALUE BPF_MAP_TYPE_CGROUP_STORAGE maps are special in a way that the access from the bpf program side is lookup-free. That means the result is guaranteed to be a valid pointer to the cgroup storage; no NULL-check is required. This patch introduces BPF_PTR_TO_MAP_VALUE return type, which is required to cause the verifier accept programs, which are not checking the map value pointer for being NULL. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 16be67888c30..ca4ac2a39def 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_arg_type { enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ }; -- cgit v1.2.3 From cd3394317653837e2eb5c5d0904a8996102af9fc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 2 Aug 2018 14:27:24 -0700 Subject: bpf: introduce the bpf_get_local_storage() helper function The bpf_get_local_storage() helper function is used to get a pointer to the bpf local storage from a bpf program. It takes a pointer to a storage map and flags as arguments. Right now it accepts only cgroup storage maps, and flags argument has to be 0. Further it can be extended to support other types of local storage: e.g. thread local storage etc. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca4ac2a39def..cd8790d2c6ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -788,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_local_storage_proto; + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -- cgit v1.2.3 From 7cca1ed0bb248b8d5768d17f5afe297a832d66c0 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 31 Jul 2018 20:25:00 +0200 Subject: netfilter: nf_osf: move nf_osf_fingers to non-uapi header file All warnings (new ones prefixed by >>): >> ./usr/include/linux/netfilter/nf_osf.h:73: userspace cannot reference function or variable defined in the kernel Fixes: f9324952088f ("netfilter: nfnetlink_osf: extract nfnetlink_subsystem code from xt_osf.c") Reported-by: kbuild test robot Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index aee460fcbd31..3e455d6f94d5 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -25,6 +25,8 @@ enum osf_fmatch_states { FMATCH_OPT_WRONG, }; +extern struct list_head nf_osf_fingers[2]; + struct nf_osf_finger { struct rcu_head rcu_head; struct list_head finger_entry; -- cgit v1.2.3 From ddba40be59c9be4059288464f8e6f38fbba27495 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 31 Jul 2018 20:25:01 +0200 Subject: netfilter: nfnetlink_osf: rename nf_osf header file to nfnetlink_osf The first client of the nf_osf.h userspace header is nft_osf, coming in this batch, rename it to nfnetlink_osf.h as there are no userspace clients for this yet, hence this looks consistent with other nfnetlink subsystem. Suggested-by: Jan Engelhardt Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 44 --------------------------------- include/linux/netfilter/nfnetlink_osf.h | 44 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) delete mode 100644 include/linux/netfilter/nf_osf.h create mode 100644 include/linux/netfilter/nfnetlink_osf.h (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h deleted file mode 100644 index 3e455d6f94d5..000000000000 --- a/include/linux/netfilter/nf_osf.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFOSF_H -#define _NFOSF_H - -#include - -/* Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum nf_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; - -enum osf_fmatch_states { - /* Packet does not match the fingerprint */ - FMATCH_WRONG = 0, - /* Packet matches the fingerprint */ - FMATCH_OK, - /* Options do not match the fingerprint, but header does */ - FMATCH_OPT_WRONG, -}; - -extern struct list_head nf_osf_fingers[2]; - -struct nf_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct nf_osf_user_finger finger; -}; - -bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, - int hooknum, struct net_device *in, struct net_device *out, - const struct nf_osf_info *info, struct net *net, - const struct list_head *nf_osf_fingers); - -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers); - -#endif /* _NFOSF_H */ diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h new file mode 100644 index 000000000000..a7311bc03d3a --- /dev/null +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFOSF_H +#define _NFOSF_H + +#include + +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +extern struct list_head nf_osf_fingers[2]; + +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + +bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers); + +const char *nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers); + +#endif /* _NFOSF_H */ -- cgit v1.2.3 From 94276fa8a2a4c08ccb2e9d55e88b95dc972ccea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= Date: Fri, 3 Aug 2018 13:36:13 +0200 Subject: netfilter: bridge: Expose nf_tables bridge hook priorities through uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Netfilter exposes standard hook priorities in case of ipv4, ipv6 and arp but not in case of bridge. This patch exposes the hook priority values of the bridge family (which are different from the formerly mentioned) via uapi so that they can be used by user-space applications just like the others. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index b671fdfd212b..fa0686500970 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -5,17 +5,6 @@ #include #include -enum nf_br_hook_priorities { - NF_BR_PRI_FIRST = INT_MIN, - NF_BR_PRI_NAT_DST_BRIDGED = -300, - NF_BR_PRI_FILTER_BRIDGED = -200, - NF_BR_PRI_BRNF = 0, - NF_BR_PRI_NAT_DST_OTHER = 100, - NF_BR_PRI_FILTER_OTHER = 200, - NF_BR_PRI_NAT_SRC = 300, - NF_BR_PRI_LAST = INT_MAX, -}; - #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -- cgit v1.2.3 From 91305f2812624c0cf7ccbb44133b66d3b24676e4 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 1 Aug 2018 18:05:54 +0800 Subject: ptp_qoriq: support automatic configuration for ptp timer This patch is to support automatic configuration for ptp timer. If required ptp dts properties are not provided, driver could try to calculate a set of default configurations to initialize the ptp timer. This makes the driver work for many boards which don't have the required ptp dts properties in current kernel. Also the users could set dts properties by themselves according to their requirement. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index dc3dac40f069..c1f003aadcce 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -127,9 +127,13 @@ struct qoriq_ptp_registers { #define DRIVER "ptp_qoriq" -#define DEFAULT_CKSEL 1 #define N_EXT_TS 2 +#define DEFAULT_CKSEL 1 +#define DEFAULT_TMR_PRSC 2 +#define DEFAULT_FIPER1_PERIOD 1000000000 +#define DEFAULT_FIPER2_PERIOD 100000 + struct qoriq_ptp { void __iomem *base; struct qoriq_ptp_registers regs; -- cgit v1.2.3 From 385114dec8a49b5e5945e77ba7de6356106713f4 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:38 +0000 Subject: net: modify skb_rbtree_purge to return the truesize of all purged skbs. Tested: see the next patch is the series. Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd3cb1b247df..47848367c816 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2585,7 +2585,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -void skb_rbtree_purge(struct rb_root *root); +unsigned int skb_rbtree_purge(struct rb_root *root); void *netdev_alloc_frag(unsigned int fragsz); -- cgit v1.2.3 From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47848367c816..7ebdf158a795 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,16 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; - int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; }; - struct sock *sk; + + union { + struct sock *sk; + int ip_defrag_offset; + }; union { ktime_t tstamp; -- cgit v1.2.3 From 6fdecfe32f903d9f5f35ee4464fd32ede470dcad Mon Sep 17 00:00:00 2001 From: Arun Parameswaran Date: Tue, 7 Aug 2018 10:02:44 -0700 Subject: net: phy: Add support for Broadcom Omega internal Combo GPHY Add support for the Broadcom Omega SoC internal Combo Ethernet GPHY to the bcm7xxx phy driver. Signed-off-by: Arun Parameswaran Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index daa9234a9baf..949e9af8d9d6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -45,6 +45,7 @@ #define PHY_ID_BCM7445 0x600d8510 #define PHY_ID_BCM_CYGNUS 0xae025200 +#define PHY_ID_BCM_OMEGA 0xae025100 #define PHY_BCM_OUI_MASK 0xfffffc00 #define PHY_BCM_OUI_1 0x00206000 -- cgit v1.2.3 From 212dfd909ea8b630e5d6fa4d25aeec9c4b4b14a5 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 7 Aug 2018 22:06:52 +0200 Subject: netfilter: nfnetlink_osf: add missing enum in nfnetlink_osf uapi header xt_osf_window_size_options was originally part of include/uapi/linux/netfilter/xt_osf.h, restore it. Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_osf.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h index a7311bc03d3a..ecf7dab81e9e 100644 --- a/include/linux/netfilter/nfnetlink_osf.h +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -4,18 +4,6 @@ #include -/* Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum nf_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; - enum osf_fmatch_states { /* Packet does not match the fingerprint */ FMATCH_WRONG = 0, -- cgit v1.2.3 From 342ac8448f1fb213908656ae5581d0f37a5954e8 Mon Sep 17 00:00:00 2001 From: Denis Drozdov Date: Wed, 8 Aug 2018 16:23:48 -0700 Subject: net/mlx5: Use max_num_eqs for calculation of required MSIX vectors New firmware has defined new HCA capability field called "max_num_eqs", that is the number of available EQs after subtracting reserved FW EQs. Before this capability the FW reported the EQ number in "log_max_eqs", the reported value also contained FW reserved EQs, but the driver might be failing to load on 320 cpus systems due to the fact that FW reserved EQs were not available to the driver. Now the driver has to obtain max_num_eqs value from new FW to get real number of EQs available. Signed-off-by: Denis Drozdov Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 60c2308fe062..63e1ce4c4826 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1133,7 +1133,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x40]; + u8 reserved_at_440[0x20]; + + u8 reserved_at_460[0x10]; + u8 max_num_eqs[0x10]; u8 reserved_at_480[0x3]; u8 log_max_l2_table[0x5]; -- cgit v1.2.3 From cc9c82a8668cf98748bfbaa83c5c092d5115c25b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 8 Aug 2018 16:23:49 -0700 Subject: net/mlx5: Rename modify/query_vport state related enums Modify and query vport state commands share the same admin_state and op_mod values, rename the enums to fit them both. In addition, remove the esw prefix from the admin state enum as this also applied for vnic. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 +++--- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index d489494b0a84..11fa4e66afc5 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -946,9 +946,9 @@ enum { }; enum { - MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, - MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, - MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, + MLX5_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_VPORT_ADMIN_STATE_AUTO = 0x2, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 63e1ce4c4826..6ead9c1a5396 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3764,8 +3764,8 @@ struct mlx5_ifc_query_vport_state_out_bits { }; enum { - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, - MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, + MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { -- cgit v1.2.3 From 29d8ebd44de8999e292dbb4de76744d4a41039ec Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 8 Aug 2018 16:23:51 -0700 Subject: net/mlx5: Remove unused mlx5_query_vport_admin_state mlx5_query_vport_admin_state() is not used anywhere. Remove it. Signed-off-by: Eli Cohen Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 9208cb8809ac..7e7c6dfcfb09 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -43,8 +43,6 @@ enum { }; u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); -u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, - u16 vport); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From 209b43759d65b2cc99ce7757249aacc82b03c4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20B=C3=BCsch?= Date: Tue, 31 Jul 2018 22:15:09 +0200 Subject: ssb: Remove SSB_WARN_ON, SSB_BUG_ON and SSB_DEBUG Use the standard WARN_ON instead. If a small kernel is desired, WARN_ON can be disabled globally. Also remove SSB_DEBUG. Besides WARN_ON it only adds a tiny debug check. Include this check unconditionally. Signed-off-by: Michael Buesch Signed-off-by: Kalle Valo --- include/linux/ssb/ssb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 3b43655cabe6..0d5a2691e7e9 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -499,11 +499,9 @@ struct ssb_bus { /* Internal-only stuff follows. Do not touch. */ struct list_head list; -#ifdef CONFIG_SSB_DEBUG /* Is the bus already powered up? */ bool powered_up; int power_warn_count; -#endif /* DEBUG */ }; enum ssb_quirks { -- cgit v1.2.3 From 624c0f0239f04cce78c86afa95eb2841c84fbab1 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 9 Aug 2018 15:38:38 +0200 Subject: phylink: add helper for configuring 2500BaseX modes Add a helper for MAC drivers to use in their validate callback to deal with 2500BaseX vs 1000BaseX modes, where the hardware supports both but it is not possible to automatically select between them. This helper defaults to 1000BaseX, as that is the 802.3 standard, and will allow users to select 2500BaseX either by forcing the speed if AN is disabled, or by changing the advertising mask if AN is enabled. Disabling AN is not recommended as it is only the speed that we're interested in controlling, not the duplex or pause mode parameters. Signed-off-by: Russell King Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 50eeae025f1e..021fc6595856 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -234,5 +234,6 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); +void phylink_helper_basex_speed(struct phylink_link_state *state); #endif -- cgit v1.2.3 From 48ae5554a076c1bca31448d60263e4038def9f6f Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 8 Aug 2018 09:04:29 +0100 Subject: net: stmmac: Add XGMAC 2.10 HWIF entry Add a new entry to HWIF table for XGMAC 2.10. For now we fill it with empty callbacks which will be added in posterior patches. Signed-off-by: Jose Abreu Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32feac5bbd75..c43e9a01b892 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,5 +190,6 @@ struct plat_stmmacenet_data { bool tso_en; int mac_port_sel_speed; bool en_tx_lpi_clockgating; + int has_xgmac; }; #endif -- cgit v1.2.3 From 5e7baf0fcb2a3aef7329f3c7543d4695a46bd321 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 9 Aug 2018 11:13:49 -0700 Subject: qed/qede: Multi CoS support. This patch adds support for tc mqprio offload, using this different traffic classes on the adapter can be utilized based on configured priority to tc map. For example - tc qdisc add dev eth0 root mqprio num_tc 4 map 0 1 2 3 This will cause SKBs with priority 0,1,2,3 to transmit over tc 0,1,2,3 hardware queues respectively. Signed-off-by: Manish Chopra Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 2978fa4add42..a1310482c4ed 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -39,6 +39,10 @@ #include #include +/* 64 max queues * (1 rx + 4 tx-cos + 1 xdp) */ +#define QED_MIN_L2_CONS (2 + NUM_PHYS_TCS_4PORT_K2) +#define QED_MAX_L2_CONS (64 * (QED_MIN_L2_CONS)) + struct qed_queue_start_common_params { /* Should always be relative to entity sending this. */ u8 vport_id; @@ -49,6 +53,8 @@ struct qed_queue_start_common_params { struct qed_sb_info *p_sb; u8 sb_idx; + + u8 tc; }; struct qed_rxq_start_ret_params { -- cgit v1.2.3 From 15693fd37fc3a49da356164ed15b571c175efc34 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Aug 2018 19:40:31 +0800 Subject: net: skbuff.h: fix using plain integer as NULL warning Fixes the following sparse warning: ./include/linux/skbuff.h:2365:58: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ebdf158a795..7e237a63a70c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2362,7 +2362,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) + if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); -- cgit v1.2.3 From b0768a86585d4d951a30ff565f19598dbbd67897 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:09 +0900 Subject: net: Export skb_headers_offset_update This is needed for veth XDP which does skb_copy_expand()-like operation. v2: - Drop skb_copy_header part because it has already been exported now. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ebdf158a795..e93b157f526c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); -- cgit v1.2.3 From 0b19cc0a8694d4295383f88bc3441765875a57bc Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:15 +0900 Subject: bpf: Make redirect_info accessible from modules We are going to add kern_flags field in redirect_info for kernel internal use. In order to avoid function call to access the flags, make redirect_info accessible from modules. Also as it is now non-static, add prefix bpf_ to redirect_info. v6: - Fix sparse warning around EXPORT_SYMBOL. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..4717af8b95e6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -537,6 +537,16 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) -- cgit v1.2.3 From 2539650fadbf63a431e76535a9de7bff6ea5e409 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:16 +0900 Subject: xdp: Helpers for disabling napi_direct of xdp_return_frame We need some mechanism to disable napi_direct on calling xdp_return_frame_rx_napi() from some context. When veth gets support of XDP_REDIRECT, it will redirects packets which are redirected from other devices. On redirection veth will reuse xdp_mem_info of the redirection source device to make return_frame work. But in this case .ndo_xdp_xmit() called from veth redirection uses xdp_mem_info which is not guarded by NAPI, because the .ndo_xdp_xmit() is not called directly from the rxq which owns the xdp_mem_info. This approach introduces a flag in bpf_redirect_info to indicate that napi_direct should be disabled even when _rx_napi variant is used as well as helper functions to use it. A NAPI handler who wants to use this flag needs to call xdp_set_return_frame_no_direct() before processing packets, and call xdp_clear_return_frame_no_direct() after xdp_do_flush_map() before exiting NAPI. v4: - Use bpf_redirect_info for storing the flag instead of xdp_mem_info to avoid per-frame copy cost. Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4717af8b95e6..2b072dab32c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,10 +543,14 @@ struct bpf_redirect_info { struct bpf_map *map; struct bpf_map *map_to_flush; unsigned long map_owner; + u32 kern_flags; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -775,6 +779,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { -- cgit v1.2.3 From c9fbb2d25295a566b97d62e6904741e8e1702d83 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 10 Aug 2018 10:47:43 +0200 Subject: net: Provide stub for __netif_set_xps_queue if there is no CONFIG_XPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building virtio_net driver without CONFIG_XPS fails with: drivers/net/virtio_net.c: In function ‘virtnet_set_affinity’: drivers/net/virtio_net.c:1910:3: error: implicit declaration of function ‘__netif_set_xps_queue’ [-Werror=implicit-function-declaration] __netif_set_xps_queue(vi->dev, mask, i, false); ^ Fixes: 4d99f6602cb5 ("net: allow to call netif_reset_xps_queues() under cpus_read_lock") Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 282e2e95ad5b..ca5ab98053c8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3412,6 +3412,13 @@ static inline int netif_set_xps_queue(struct net_device *dev, { return 0; } + +static inline int __netif_set_xps_queue(struct net_device *dev, + const unsigned long *mask, + u16 index, bool is_rxqs_map) +{ + return 0; +} #endif /** -- cgit v1.2.3 From 5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:24 -0700 Subject: bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY This patch introduces a new map type BPF_MAP_TYPE_REUSEPORT_SOCKARRAY. To unleash the full potential of a bpf prog, it is essential for the userspace to be capable of directly setting up a bpf map which can then be consumed by the bpf prog to make decision. In this case, decide which SO_REUSEPORT sk to serve the incoming request. By adding BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, the userspace has total control and visibility on where a SO_REUSEPORT sk should be located in a bpf map. The later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT such that the bpf prog can directly select a sk from the bpf map. That will raise the programmability of the bpf prog attached to a reuseport group (a group of sk serving the same IP:PORT). For example, in UDP, the bpf prog can peek into the payload (e.g. through the "data" pointer introduced in the later patch) to learn the application level's connection information and then decide which sk to pick from a bpf map. The userspace can tightly couple the sk's location in a bpf map with the application logic in generating the UDP payload's connection information. This connection info contact/API stays within the userspace. Also, when used with map-in-map, the userspace can switch the old-server-process's inner map to a new-server-process's inner map in one call "bpf_map_update_elem(outer_map, &index, &new_reuseport_array)". The bpf prog will then direct incoming requests to the new process instead of the old process. The old process can finish draining the pending requests (e.g. by "accept()") before closing the old-fds. [Note that deleting a fd from a bpf map does not necessary mean the fd is closed] During map_update_elem(), Only SO_REUSEPORT sk (i.e. which has already been added to a reuse->socks[]) can be used. That means a SO_REUSEPORT sk that is "bind()" for UDP or "bind()+listen()" for TCP. These conditions are ensured in "reuseport_array_update_check()". A SO_REUSEPORT sk can only be added once to a map (i.e. the same sk cannot be added twice even to the same map). SO_REUSEPORT already allows another sk to be created for the same IP:PORT. There is no need to re-create a similar usage in the BPF side. When a SO_REUSEPORT is deleted from the "reuse->socks[]" (e.g. "close()"), it will notify the bpf map to remove it from the map also. It is done through "bpf_sk_reuseport_detach()" and it will only be called if >=1 of the "reuse->sock[]" has ever been added to a bpf map. The map_update()/map_delete() has to be in-sync with the "reuse->socks[]". Hence, the same "reuseport_lock" used by "reuse->socks[]" has to be used here also. Care has been taken to ensure the lock is only acquired when the adding sk passes some strict tests. and freeing the map does not require the reuseport_lock. The reuseport_array will also support lookup from the syscall side. It will return a sock_gen_cookie(). The sock_gen_cookie() is on-demand (i.e. a sk's cookie is not generated until the very first map_lookup_elem()). The lookup cookie is 64bits but it goes against the logical userspace expectation on 32bits sizeof(fd) (and as other fd based bpf maps do also). It may catch user in surprise if we enforce value_size=8 while userspace still pass a 32bits fd during update. Supporting different value_size between lookup and update seems unintuitive also. We also need to consider what if other existing fd based maps want to return 64bits value from syscall's lookup in the future. Hence, reuseport_array supports both value_size 4 and 8, and assuming user will usually use value_size=4. The syscall's lookup will return ENOSPC on value_size=4. It will will only return 64bits value from sock_gen_cookie() when user consciously choose value_size=8 (as a signal that lookup is desired) which then requires a 64bits value in both lookup and update. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 28 ++++++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..db11662faea6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -524,6 +524,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +770,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..14fd6c02d258 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -60,4 +60,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif -- cgit v1.2.3 From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantic similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is adding to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id not match). The bpf prog can decide if it wants to do SK_DROP as its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does , it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 +++ include/linux/filter.h | 15 +++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b072dab32c0..70e9d57677fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; -- cgit v1.2.3 From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:26 -0700 Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to handle this case and return NULL immediately instead of continuing the sk search from its hashtable. It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check if the attaching bpf prog is in the new SK_REUSEPORT or the existing SOCKET_FILTER type and then check different things accordingly. One level of "__reuseport_attach_prog()" call is removed. The "sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c seems to have more knowledge on those test requirements than filter.c. In "reuseport_attach_prog()", after new_prog is attached to reuse->prog, the old_prog (if any) is also directly freed instead of returning the old_prog to the caller and asking the caller to free. The sysctl_optmem_max check is moved back to the "sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only bounded by the usual "bpf_prog_charge_memlock()" during load time instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e9d57677fe..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -- cgit v1.2.3 From 19e226e8cc5da02f17ed119f9137036c0f0f5d80 Mon Sep 17 00:00:00 2001 From: Caleb Raitto Date: Thu, 9 Aug 2018 18:18:28 -0700 Subject: virtio: Make vp_set_vq_affinity() take a mask. Make vp_set_vq_affinity() take a cpumask instead of taking a single CPU. If there are fewer queues than cores, queue affinity should be able to map to multiple cores. Link: https://patchwork.ozlabs.org/patch/948149/ Suggested-by: Willem de Bruijn Signed-off-by: Caleb Raitto Acked-by: Gonglei Signed-off-by: David S. Miller --- include/linux/virtio_config.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 5559a2d31c46..32baf8e26735 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -79,7 +79,8 @@ struct virtio_config_ops { u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); const char *(*bus_name)(struct virtio_device *vdev); - int (*set_vq_affinity)(struct virtqueue *vq, int cpu); + int (*set_vq_affinity)(struct virtqueue *vq, + const struct cpumask *cpu_mask); const struct cpumask *(*get_vq_affinity)(struct virtio_device *vdev, int index); }; @@ -236,11 +237,11 @@ const char *virtio_bus_name(struct virtio_device *vdev) * */ static inline -int virtqueue_set_affinity(struct virtqueue *vq, int cpu) +int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; if (vdev->config->set_vq_affinity) - return vdev->config->set_vq_affinity(vq, cpu); + return vdev->config->set_vq_affinity(vq, cpu_mask); return 0; } -- cgit v1.2.3 From e8d2bec0457962e8f348a9a3627b398f7fe5c5fc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 12 Aug 2018 01:59:17 +0200 Subject: bpf: decouple btf from seq bpf fs dump and enable more maps Commit a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") and 699c86d6ec21 ("bpf: btf: add pretty print for hash/lru_hash maps") enabled support for BTF and dumping via BPF fs for array and hash/lru map. However, both can be decoupled from each other such that regular BPF maps can be supported for attaching BTF key/value information, while not all maps necessarily need to dump via map_seq_show_elem() callback. The basic sanity check which is a prerequisite for all maps is that key/value size has to match in any case, and some maps can have extra checks via map_check_btf() callback, e.g. probing certain types or indicating no support in general. With that we can also enable retrieving BTF info for per-cpu map types and lpm. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song --- include/linux/bpf.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db11662faea6..523481a3471b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -48,8 +48,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ -- cgit v1.2.3 From 7723628101aaeb1d723786747529b4ea65c5b5c5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sun, 12 Aug 2018 10:49:27 -0700 Subject: bpf: Introduce bpf_skb_ancestor_cgroup_id helper == Problem description == It's useful to be able to identify cgroup associated with skb in TC so that a policy can be applied to this skb, and existing bpf_skb_cgroup_id helper can help with this. Though in real life cgroup hierarchy and hierarchy to apply a policy to don't map 1:1. It's often the case that there is a container and corresponding cgroup, but there are many more sub-cgroups inside container, e.g. because it's delegated to containerized application to control resources for its subsystems, or to separate application inside container from infra that belongs to containerization system (e.g. sshd). At the same time it may be useful to apply a policy to container as a whole. If multiple containers like this are run on a host (what is often the case) and many of them have sub-cgroups, it may not be possible to apply per-container policy in TC with existing helpers such as bpf_skb_under_cgroup or bpf_skb_cgroup_id: * bpf_skb_cgroup_id will return id of immediate cgroup associated with skb, i.e. if it's a sub-cgroup inside container, it can't be used to identify container's cgroup; * bpf_skb_under_cgroup can work only with one cgroup and doesn't scale, i.e. if there are N containers on a host and a policy has to be applied to M of them (0 <= M <= N), it'd require M calls to bpf_skb_under_cgroup, and, if M changes, it'd require to rebuild & load new BPF program. == Solution == The patch introduces new helper bpf_skb_ancestor_cgroup_id that can be used to get id of cgroup v2 that is an ancestor of cgroup associated with skb at specified level of cgroup hierarchy. That way admin can place all containers on one level of cgroup hierarchy (what is a good practice in general and already used in many configurations) and identify specific cgroup on this level no matter what sub-cgroup skb is associated with. E.g. if there is a cgroup hierarchy: root/ root/container1/ root/container1/app11/ root/container1/app11/sub-app-a/ root/container1/app12/ root/container2/ root/container2/app21/ root/container2/app22/ root/container2/app22/sub-app-b/ , then having skb associated with root/container1/app11/sub-app-a/ it's possible to get ancestor at level 1, what is container1 and apply policy for this container, or apply another policy if it's container2. Policies can be kept e.g. in a hash map where key is a container cgroup id and value is an action. Levels where container cgroups are created are usually known in advance whether cgroup hierarchy inside container may be hard to predict especially in case when its creation is delegated to containerized application. == Implementation details == The helper gets ancestor by walking parents up to specified level. Another option would be to get different kind of "id" from cgroup->ancestor_ids[level] and use it with idr_find() to get struct cgroup for ancestor. But that would require radix lookup what doesn't seem to be better (at least it's not obviously better). Format of return value of the new helper is same as that of bpf_skb_cgroup_id. Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- include/linux/cgroup.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -553,6 +553,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested -- cgit v1.2.3 From 9af18e56d43ca4864ce65c3542c513827c2697de Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 12 Aug 2018 09:14:03 -0400 Subject: cpumask: make cpumask_next_wrap available without smp The kbuild robot shows build failure on machines without CONFIG_SMP: drivers/net/virtio_net.c:1916:10: error: implicit declaration of function 'cpumask_next_wrap' cpumask_next_wrap is exported from lib/cpumask.o, which has lib-$(CONFIG_SMP) += cpumask.o same as other functions, also define it as static inline in the NR_CPUS==1 branch in include/linux/cpumask.h. If wrap is true and next == start, return nr_cpumask_bits, or 1. Else wrap across the range of valid cpus, here [0]. Fixes: 2ca653d607ce ("virtio_net: Stripe queue affinities across cores.") Signed-off-by: Willem de Bruijn Tested-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- include/linux/cpumask.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 57f20a0a7794..147bdec42215 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -159,6 +159,13 @@ static inline unsigned int cpumask_next_and(int n, return n+1; } +static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, + int start, bool wrap) +{ + /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ + return (wrap && n == 0); +} + /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) -- cgit v1.2.3 From cf916ffbe0c628bb072ea829f300cff1874162ce Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 29 Apr 2018 09:17:34 -0500 Subject: net/mlx5: Improve argument name for add flow API The last argument to mlx5_add_flow_rules passes the number of destinations in the struct pointed to by the dest arg. Change the name to better reflect this fact. Signed-off-by: Eli Cohen Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c40f2fc68655..71fb503b2b52 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -177,7 +177,7 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft, struct mlx5_flow_spec *spec, struct mlx5_flow_act *flow_act, struct mlx5_flow_destination *dest, - int dest_num); + int num_dest); void mlx5_del_flow_rules(struct mlx5_flow_handle *fr); int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, -- cgit v1.2.3