diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-06 18:39:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-06 18:39:49 -0700 |
commit | 1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch) | |
tree | dcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /include/net | |
parent | 285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff) | |
parent | 7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Add Maglev hashing scheduler to IPVS, from Inju Song.
2) Lots of new TC subsystem tests from Roman Mashak.
3) Add TCP zero copy receive and fix delayed acks and autotuning with
SO_RCVLOWAT, from Eric Dumazet.
4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard
Brouer.
5) Add ttl inherit support to vxlan, from Hangbin Liu.
6) Properly separate ipv6 routes into their logically independant
components. fib6_info for the routing table, and fib6_nh for sets of
nexthops, which thus can be shared. From David Ahern.
7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP
messages from XDP programs. From Nikita V. Shirokov.
8) Lots of long overdue cleanups to the r8169 driver, from Heiner
Kallweit.
9) Add BTF ("BPF Type Format"), from Martin KaFai Lau.
10) Add traffic condition monitoring to iwlwifi, from Luca Coelho.
11) Plumb extack down into fib_rules, from Roopa Prabhu.
12) Add Flower classifier offload support to igb, from Vinicius Costa
Gomes.
13) Add UDP GSO support, from Willem de Bruijn.
14) Add documentation for eBPF helpers, from Quentin Monnet.
15) Add TLS tx offload to mlx5, from Ilya Lesokhin.
16) Allow applications to be given the number of bytes available to read
on a socket via a control message returned from recvmsg(), from
Soheil Hassas Yeganeh.
17) Add x86_32 eBPF JIT compiler, from Wang YanQing.
18) Add AF_XDP sockets, with zerocopy support infrastructure as well.
From Björn Töpel.
19) Remove indirect load support from all of the BPF JITs and handle
these operations in the verifier by translating them into native BPF
instead. From Daniel Borkmann.
20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha.
21) Allow XDP programs to do lookups in the main kernel routing tables
for forwarding. From David Ahern.
22) Allow drivers to store hardware state into an ELF section of kernel
dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy.
23) Various RACK and loss detection improvements in TCP, from Yuchung
Cheng.
24) Add TCP SACK compression, from Eric Dumazet.
25) Add User Mode Helper support and basic bpfilter infrastructure, from
Alexei Starovoitov.
26) Support ports and protocol values in RTM_GETROUTE, from Roopa
Prabhu.
27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard
Brouer.
28) Add lots of forwarding selftests, from Petr Machata.
29) Add generic network device failover driver, from Sridhar Samudrala.
* ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits)
strparser: Add __strp_unpause and use it in ktls.
rxrpc: Fix terminal retransmission connection ID to include the channel
net: hns3: Optimize PF CMDQ interrupt switching process
net: hns3: Fix for VF mailbox receiving unknown message
net: hns3: Fix for VF mailbox cannot receiving PF response
bnx2x: use the right constant
Revert "net: sched: cls: Fix offloading when ingress dev is vxlan"
net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
enic: fix UDP rss bits
netdev-FAQ: clarify DaveM's position for stable backports
rtnetlink: validate attributes in do_setlink()
mlxsw: Add extack messages for port_{un, }split failures
netdevsim: Add extack error message for devlink reload
devlink: Add extack to reload and port_{un, }split operations
net: metrics: add proper netlink validation
ipmr: fix error path when ipmr_new_table fails
ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds
net: hns3: remove unused hclgevf_cfg_func_mta_filter
netfilter: provide udp*_lib_lookup for nf_tproxy
qed*: Utilize FW 8.37.2.0
...
Diffstat (limited to 'include/net')
65 files changed, 1456 insertions, 471 deletions
diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 378d601258be..5f43f7a70fe6 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,6 +59,19 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; +struct ifa6_config { + const struct in6_addr *pfx; + unsigned int plen; + + const struct in6_addr *peer_pfx; + + u32 rt_priority; + u32 ifa_flags; + u32 preferred_lft; + u32 valid_lft; + u16 scope; +}; + int addrconf_init(void); void addrconf_cleanup(void); @@ -223,6 +236,22 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, @@ -308,6 +337,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) } /** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + +/** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * diff --git a/include/net/ax88796.h b/include/net/ax88796.h index b9a3beca0ce4..84b3785d0e66 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -12,6 +12,10 @@ #ifndef __NET_AX88796_PLAT_H #define __NET_AX88796_PLAT_H +struct sk_buff; +struct net_device; +struct platform_device; + #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ #define AXFLG_HAS_93CX6 (1<<2) /* use eeprom_93cx6 driver */ @@ -26,6 +30,16 @@ struct ax_plat_data { u32 *reg_offsets; /* register offsets */ u8 *mac_addr; /* MAC addr (only used when AXFLG_MAC_FROMPLATFORM is used */ + + /* uses default ax88796 buffer if set to NULL */ + void (*block_output)(struct net_device *dev, int count, + const unsigned char *buf, int star_page); + void (*block_input)(struct net_device *dev, int count, + struct sk_buff *skb, int ring_offset); + /* returns nonzero if a pending interrupt request might by caused by + * the ax88786. Handles all interrupts if set to NULL + */ + int (*check_irq)(struct platform_device *pdev); }; #endif /* __NET_AX88796_PLAT_H */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b619a190ff12..893bbbb5d2fa 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1393,6 +1393,8 @@ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout); struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout); +int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param); int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param); diff --git a/include/net/bonding.h b/include/net/bonding.h index b52235158836..808f1d167349 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -285,8 +285,15 @@ static inline bool bond_needs_speed_duplex(const struct bonding *bond) static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { - return (BOND_MODE(bond) == BOND_MODE_TLB) && - (bond->params.tlb_dynamic_lb == 0); + return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); +} + +static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) +{ + return (BOND_MODE(bond) == BOND_MODE_8023AD || + BOND_MODE(bond) == BOND_MODE_XOR || + BOND_MODE(bond) == BOND_MODE_TLB || + BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 250dac390806..5fbfe61f41c6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1080,6 +1080,37 @@ struct sta_bss_parameters { }; /** + * struct cfg80211_txq_stats - TXQ statistics for this TID + * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to + * indicate the relevant values in this struct are filled + * @backlog_bytes: total number of bytes currently backlogged + * @backlog_packets: total number of packets currently backlogged + * @flows: number of new flows seen + * @drops: total number of packets dropped + * @ecn_marks: total number of packets marked with ECN CE + * @overlimit: number of drops due to queue space overflow + * @overmemory: number of drops due to memory limit overflow + * @collisions: number of hash collisions + * @tx_bytes: total number of bytes dequeued + * @tx_packets: total number of packets dequeued + * @max_flows: maximum number of flows supported + */ +struct cfg80211_txq_stats { + u32 filled; + u32 backlog_bytes; + u32 backlog_packets; + u32 flows; + u32 drops; + u32 ecn_marks; + u32 overlimit; + u32 overmemory; + u32 collisions; + u32 tx_bytes; + u32 tx_packets; + u32 max_flows; +}; + +/** * struct cfg80211_tid_stats - per-TID statistics * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to * indicate the relevant values in this struct are filled @@ -1088,6 +1119,7 @@ struct sta_bss_parameters { * @tx_msdu_retries: number of retries (not counting the first) for * transmitted MSDUs * @tx_msdu_failed: number of failed transmitted MSDUs + * @txq_stats: TXQ statistics */ struct cfg80211_tid_stats { u32 filled; @@ -1095,6 +1127,7 @@ struct cfg80211_tid_stats { u64 tx_msdu; u64 tx_msdu_retries; u64 tx_msdu_failed; + struct cfg80211_txq_stats txq_stats; }; #define IEEE80211_MAX_CHAINS 4 @@ -1151,7 +1184,10 @@ struct cfg80211_tid_stats { * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. */ struct station_info { u64 filled; @@ -1195,8 +1231,9 @@ struct station_info { u64 rx_beacon; u64 rx_duration; u8 rx_beacon_signal_avg; - struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; + struct cfg80211_tid_stats *pertid; s8 ack_signal; + s8 avg_ack_signal; }; #if IS_ENABLED(CONFIG_CFG80211) @@ -2188,9 +2225,14 @@ struct cfg80211_connect_params { * have to be updated as part of update_connect_params() call. * * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated + * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm, + * username, erp sequence number and rrk) are updated + * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated */ enum cfg80211_connect_params_changed { UPDATE_ASSOC_IES = BIT(0), + UPDATE_FILS_ERP_INFO = BIT(1), + UPDATE_AUTH_TYPE = BIT(2), }; /** @@ -2201,6 +2243,9 @@ enum cfg80211_connect_params_changed { * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed * @WIPHY_PARAM_DYN_ACK: dynack has been enabled + * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed + * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed + * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -2209,6 +2254,9 @@ enum wiphy_params_flags { WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, WIPHY_PARAM_DYN_ACK = 1 << 5, + WIPHY_PARAM_TXQ_LIMIT = 1 << 6, + WIPHY_PARAM_TXQ_MEMORY_LIMIT = 1 << 7, + WIPHY_PARAM_TXQ_QUANTUM = 1 << 8, }; /** @@ -2961,6 +3009,9 @@ struct cfg80211_external_auth_params { * * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS * + * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this + * function should return phy stats, and interface stats otherwise. + * * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake. * If not deleted through @del_pmk the PMK remains valid until disconnect * upon which the driver should clear it. @@ -3262,6 +3313,10 @@ struct cfg80211_ops { struct net_device *dev, const bool enabled); + int (*get_txq_stats)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_txq_stats *txqstats); + int (*set_pmk)(struct wiphy *wiphy, struct net_device *dev, const struct cfg80211_pmk_conf *conf); int (*del_pmk)(struct wiphy *wiphy, struct net_device *dev, @@ -3806,6 +3861,10 @@ struct wiphy_iftype_ext_capab { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * + * @txq_limit: configuration of internal TX queue frame limit + * @txq_memory_limit: configuration internal TX queue memory limit + * @txq_quantum: configuration of internal TX queue scheduler quantum */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -3940,6 +3999,10 @@ struct wiphy { u8 nan_supported_bands; + u32 txq_limit; + u32 txq_memory_limit; + u32 txq_quantum; + char priv[0] __aligned(NETDEV_ALIGN); }; @@ -5363,6 +5426,30 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) #endif /** + * struct cfg80211_fils_resp_params - FILS connection response params + * @kek: KEK derived from a successful FILS connection (may be %NULL) + * @kek_len: Length of @fils_kek in octets + * @update_erp_next_seq_num: Boolean value to specify whether the value in + * @erp_next_seq_num is valid. + * @erp_next_seq_num: The next sequence number to use in ERP message in + * FILS Authentication. This value should be specified irrespective of the + * status for a FILS connection. + * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). + * @pmk_len: Length of @pmk in octets + * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID + * used for this FILS connection (may be %NULL). + */ +struct cfg80211_fils_resp_params { + const u8 *kek; + size_t kek_len; + bool update_erp_next_seq_num; + u16 erp_next_seq_num; + const u8 *pmk; + size_t pmk_len; + const u8 *pmkid; +}; + +/** * struct cfg80211_connect_resp_params - Connection response params * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use * %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you @@ -5380,17 +5467,7 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) * @resp_ie_len: Association response IEs length - * @fils_kek: KEK derived from a successful FILS connection (may be %NULL) - * @fils_kek_len: Length of @fils_kek in octets - * @update_erp_next_seq_num: Boolean value to specify whether the value in - * @fils_erp_next_seq_num is valid. - * @fils_erp_next_seq_num: The next sequence number to use in ERP message in - * FILS Authentication. This value should be specified irrespective of the - * status for a FILS connection. - * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). - * @pmk_len: Length of @pmk in octets - * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID - * used for this FILS connection (may be %NULL). + * @fils: FILS connection response parameters. * @timeout_reason: Reason for connection timeout. This is used when the * connection fails due to a timeout instead of an explicit rejection from * the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is @@ -5406,13 +5483,7 @@ struct cfg80211_connect_resp_params { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; - const u8 *fils_kek; - size_t fils_kek_len; - bool update_erp_next_seq_num; - u16 fils_erp_next_seq_num; - const u8 *pmk; - size_t pmk_len; - const u8 *pmkid; + struct cfg80211_fils_resp_params fils; enum nl80211_timeout_reason timeout_reason; }; @@ -5558,6 +5629,7 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length + * @fils: FILS related roaming information. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5567,6 +5639,7 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; + struct cfg80211_fils_resp_params fils; }; /** @@ -5648,6 +5721,26 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @sinfo: the station information + * @gfp: allocation flags + */ +int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); + +/** + * cfg80211_sinfo_release_content - release contents of station info + * @sinfo: the station information + * + * Releases any potentially allocated sub-information of the station + * information, but not the struct itself (since it's typically on + * the stack.) + */ +static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) +{ + kfree(sinfo->pertid); +} /** * cfg80211_new_sta - notify userspace about station diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 207d9ba1f92c..0e5e91be2d30 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -101,6 +101,10 @@ struct dcbnl_rtnl_ops { /* CEE peer */ int (*cee_peer_getpg) (struct net_device *, struct cee_pg *); int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *); + + /* buffer settings */ + int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); + int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/net/devlink.h b/include/net/devlink.h index 2e4f71e16e95..e336ea9c73df 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -35,6 +35,14 @@ struct devlink { char priv[0] __aligned(NETDEV_ALIGN); }; +struct devlink_port_attrs { + bool set; + enum devlink_port_flavour flavour; + u32 port_number; /* same value as "split group" */ + bool split; + u32 split_subport_number; +}; + struct devlink_port { struct list_head list; struct devlink *devlink; @@ -43,8 +51,7 @@ struct devlink_port { enum devlink_port_type type; enum devlink_port_type desired_type; void *type_dev; - bool split; - u32 split_group; + struct devlink_port_attrs attrs; }; struct devlink_sb_pool_info { @@ -289,12 +296,13 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 struct devlink_ops { - int (*reload)(struct devlink *devlink); + int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, - unsigned int count); - int (*port_unsplit)(struct devlink *devlink, unsigned int port_index); + unsigned int count, struct netlink_ext_ack *extack); + int (*port_unsplit)(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); @@ -367,8 +375,12 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port, void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); -void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group); +void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + u32 port_number, bool split, + u32 split_subport_number); +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, @@ -466,11 +478,20 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port) { } -static inline void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group) +static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + u32 port_number, bool split, + u32 split_subport_number) { } +static inline int +devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len) +{ + return -EOPNOTSUPP; +} + static inline int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/include/net/dsa.h b/include/net/dsa.h index 60fb4ec8ba61..fdbd6082945d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -20,12 +20,14 @@ #include <linux/of.h> #include <linux/ethtool.h> #include <linux/net_tstamp.h> +#include <linux/phy.h> #include <net/devlink.h> #include <net/switchdev.h> struct tc_action; struct phy_device; struct fixed_phy_status; +struct phylink_link_state; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, @@ -199,6 +201,7 @@ struct dsa_port { u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; + struct phylink *pl; /* * Original copy of the master netdev ethtool_ops */ @@ -354,12 +357,36 @@ struct dsa_switch_ops { struct fixed_phy_status *st); /* + * PHYLINK integration + */ + void (*phylink_validate)(struct dsa_switch *ds, int port, + unsigned long *supported, + struct phylink_link_state *state); + int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, + struct phylink_link_state *state); + void (*phylink_mac_config)(struct dsa_switch *ds, int port, + unsigned int mode, + const struct phylink_link_state *state); + void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); + void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, + unsigned int mode, + phy_interface_t interface); + void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev); + void (*phylink_fixed_state)(struct dsa_switch *ds, int port, + struct phylink_link_state *state); + /* * ethtool hardware statistics. */ - void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); + void (*get_strings)(struct dsa_switch *ds, int port, + u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); - int (*get_sset_count)(struct dsa_switch *ds, int port); + int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); + void (*get_ethtool_phy_stats)(struct dsa_switch *ds, + int port, uint64_t *data); /* * ethtool Wake-on-LAN @@ -588,4 +615,10 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, #define BRCM_TAG_GET_PORT(v) ((v) >> 8) #define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + +int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); +int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); +int dsa_port_get_phy_sset_count(struct dsa_port *dp); +void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); + #endif diff --git a/include/net/erspan.h b/include/net/erspan.h index d044aa60cc76..b39643ef4c95 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -219,6 +219,33 @@ static inline __be32 erspan_get_timestamp(void) return htonl((u32)h_usecs); } +/* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 + * 00b --> Good frame with no error, or unknown integrity + * 01b --> Payload is a Short Frame + * 10b --> Payload is an Oversized Frame + * 11b --> Payload is a Bad Frame with CRC or Alignment Error + */ +enum erspan_bso { + BSO_NOERROR = 0x0, + BSO_SHORT = 0x1, + BSO_OVERSIZED = 0x2, + BSO_BAD = 0x3, +}; + +static inline u8 erspan_detect_bso(struct sk_buff *skb) +{ + /* BSO_BAD is not handled because the frame CRC + * or alignment error information is in FCS. + */ + if (skb->len < ETH_ZLEN) + return BSO_SHORT; + + if (skb->len > ETH_FRAME_LEN) + return BSO_OVERSIZED; + + return BSO_NOERROR; +} + static inline void erspan_build_header_v2(struct sk_buff *skb, u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) @@ -248,6 +275,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, vlan_tci = ntohs(qp->tci); } + bso = erspan_detect_bso(skb); skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); diff --git a/include/net/failover.h b/include/net/failover.h new file mode 100644 index 000000000000..bb15438f39c7 --- /dev/null +++ b/include/net/failover.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _FAILOVER_H +#define _FAILOVER_H + +#include <linux/netdevice.h> + +struct failover_ops { + int (*slave_pre_register)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_register)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_pre_unregister)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_unregister)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_link_change)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_name_change)(struct net_device *slave_dev, + struct net_device *failover_dev); + rx_handler_result_t (*slave_handle_frame)(struct sk_buff **pskb); +}; + +struct failover { + struct list_head list; + struct net_device __rcu *failover_dev; + struct failover_ops __rcu *ops; +}; + +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops); +void failover_unregister(struct failover *failover); +int failover_slave_unregister(struct net_device *slave_dev); + +#endif /* _FAILOVER_H */ diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e5cfcfc7dd93..b473df5b9512 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -75,7 +75,8 @@ struct fib_rules_ops { int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, - struct nlattr **); + struct nlattr **, + struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index d1fcf2442a42..adc24df56b90 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -226,6 +226,11 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; +struct flow_keys_basic { + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; +}; + struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic @@ -244,7 +249,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; -extern struct flow_dissector flow_keys_buf_dissector; +extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d4088d1a688d..d7578cf49c3a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -42,6 +42,7 @@ enum { struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; + __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; @@ -64,7 +65,7 @@ struct inet6_ifaddr { struct delayed_work dad_work; struct inet6_dev *idev; - struct rt6_info *rt; + struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; @@ -143,8 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; - struct inet6_dev *aca_idev; - struct rt6_info *aca_rt; + struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; refcount_t aca_refcnt; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b68fea022a82..0a6c9e0f2b5a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -23,8 +23,6 @@ #include <net/inet_sock.h> #include <net/request_sock.h> -#define INET_CSK_DEBUG 1 - /* Cancel timers, when they are not required. */ #undef INET_CSK_CLEAR_TIMERS @@ -77,6 +75,7 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_clean_acked Clean acked data hook * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts @@ -102,6 +101,7 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void *icsk_ulp_data; + void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, @@ -194,10 +194,6 @@ static inline void inet_csk_delack_init(struct sock *sk) void inet_csk_delete_keepalive_timer(struct sock *sk); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); -#ifdef INET_CSK_DEBUG -extern const char inet_csk_timer_bug_msg[]; -#endif - static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -212,12 +208,9 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif + } else { + pr_debug("inet_csk BUG: unknown timer value\n"); } -#ifdef INET_CSK_DEBUG - else { - pr_debug("%s", inet_csk_timer_bug_msg); - } -#endif } /* @@ -230,10 +223,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) { -#ifdef INET_CSK_DEBUG pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); -#endif when = max_when; } @@ -247,12 +238,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, icsk->icsk_ack.pending |= ICSK_ACK_TIMER; icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } else { + pr_debug("inet_csk BUG: unknown timer value\n"); } -#ifdef INET_CSK_DEBUG - else { - pr_debug("%s", inet_csk_timer_bug_msg); - } -#endif } static inline unsigned long diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 0a671c32d6b9..83d5b3c2ac42 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -147,6 +147,7 @@ struct inet_cork { __u8 ttl; __s16 tos; char priority; + __u16 gso_size; }; struct inet_cork_full { diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c7be1ca8e562..78775038f011 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -61,7 +61,7 @@ struct inet_timewait_sock { #define tw_cookie __tw_common.skc_cookie #define tw_dr __tw_common.skc_tw_dr - int tw_timeout; + __u32 tw_mark; volatile unsigned char tw_substate; unsigned char tw_rcv_wscale; diff --git a/include/net/ip.h b/include/net/ip.h index ecffd843e7b8..0d2281b4b27a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -76,6 +76,7 @@ struct ipcm_cookie { __u8 ttl; __s16 tos; char priority; + __u16 gso_size; }; #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) @@ -171,7 +172,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags); + struct inet_cork *cork, unsigned int flags); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { @@ -396,6 +397,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); @@ -660,4 +664,7 @@ extern int sysctl_icmp_msgs_burst; int ip_misc_proc_init(void); #endif +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, + struct netlink_ext_ack *extack); + #endif /* _IP_H */ diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0e79c3408569..5cba71d2dc44 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -38,6 +38,7 @@ #endif struct rt6_info; +struct fib6_info; struct fib6_config { u32 fc_table; @@ -74,12 +75,12 @@ struct fib6_node { #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif - struct rt6_info __rcu *leaf; + struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info __rcu *rr_ptr; + struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -94,11 +95,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -127,92 +123,104 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 -struct rt6_info { - struct dst_entry dst; - struct rt6_info __rcu *rt6_next; - struct rt6_info *from; +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; - /* - * Tail elements of dst_entry (__refcnt etc.) - * and these elements (rarely used in hot path) are in - * the same cache line. - */ - struct fib6_table *rt6i_table; - struct fib6_node __rcu *rt6i_node; + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; - struct in6_addr rt6i_gateway; +struct fib6_info { + struct fib6_table *fib6_table; + struct fib6_info __rcu *fib6_next; + struct fib6_node __rcu *fib6_node; /* Multipath routes: - * siblings is a list of rt6_info that have the the same metric/weight, + * siblings is a list of fib6_info that have the the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; + struct list_head fib6_siblings; + unsigned int fib6_nsiblings; - atomic_t rt6i_ref; + atomic_t fib6_ref; + unsigned long expires; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - unsigned int rt6i_nh_flags; + struct rt6key fib6_dst; + u32 fib6_flags; + struct rt6key fib6_src; + struct rt6key fib6_prefsrc; - /* These are in a separate cache line. */ - struct rt6key rt6i_dst ____cacheline_aligned_in_smp; - u32 rt6i_flags; + struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; + + u32 fib6_metric; + u8 fib6_protocol; + u8 fib6_type; + u8 exception_bucket_flushed:1, + should_flush:1, + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; + + struct fib6_nh fib6_nh; +}; + +struct rt6_info { + struct dst_entry dst; + struct fib6_info __rcu *from; + + struct rt6key rt6i_dst; struct rt6key rt6i_src; + struct in6_addr rt6i_gateway; + struct inet6_dev *rt6i_idev; + u32 rt6i_flags; struct rt6key rt6i_prefsrc; struct list_head rt6i_uncached; struct uncached_list *rt6i_uncached_list; - struct inet6_dev *rt6i_idev; - struct rt6_info * __percpu *rt6i_pcpu; - struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - - u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; - u8 rt6i_protocol; - u8 exception_bucket_flushed:1, - should_flush:1, - unused:6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ - rt = rcu_dereference(rt->rt6_next)) + rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ - rt = rcu_dereference_protected(rt->rt6_next, 1)) + rt = rcu_dereference_protected(rt->fib6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; } -static inline void rt6_clean_expires(struct rt6_info *rt) +static inline void fib6_clean_expires(struct fib6_info *f6i) { - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; + f6i->fib6_flags &= ~RTF_EXPIRES; + f6i->expires = 0; } -static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) +static inline void fib6_set_expires(struct fib6_info *f6i, + unsigned long expires) { - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; + f6i->expires = expires; + f6i->fib6_flags |= RTF_EXPIRES; } -static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) +static inline bool fib6_check_expired(const struct fib6_info *f6i) { - struct rt6_info *rt; - - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; - dst_set_expires(&rt0->dst, timeout); - rt0->rt6i_flags |= RTF_EXPIRES; + if (f6i->fib6_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; } /* Function to safely get fn->sernum for passed in rt @@ -220,14 +228,13 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, - u32 *cookie) +static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + u32 *cookie) { struct fib6_node *fn; bool status = false; - rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; @@ -236,19 +243,22 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, status = true; } - rcu_read_unlock(); return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + struct fib6_info *from; u32 cookie = 0; - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - rt = rt->from; + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + fib6_get_cookie_safe(from, &cookie); - rt6_get_cookie_safe(rt, &cookie); + rcu_read_unlock(); return cookie; } @@ -262,20 +272,18 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct fib6_info *f6i); -static inline void rt6_hold(struct rt6_info *rt) +static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&rt->rt6i_ref); + atomic_inc(&f6i->fib6_ref); } -static inline void rt6_release(struct rt6_info *rt) +static inline void fib6_info_release(struct fib6_info *f6i) { - if (atomic_dec_and_test(&rt->rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } + if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) + fib6_info_destroy(f6i); } enum fib6_walk_state { @@ -291,7 +299,7 @@ enum fib6_walk_state { struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; - struct rt6_info *leaf; + struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; @@ -355,7 +363,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ - struct rt6_info *rt; + struct fib6_info *rt; }; /* @@ -368,24 +376,49 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); -struct fib6_node *fib6_lookup(struct fib6_node *root, - const struct in6_addr *daddr, - const struct in6_addr *saddr); +/* called with rcu lock held; can return error pointer + * caller needs to select path + */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags); + +/* called with rcu lock held; caller needs to select path */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict); + +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict); + +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); -int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); -int fib6_del(struct rt6_info *rt, struct nl_info *info); +int fib6_add(struct fib6_node *root, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack); +int fib6_del(struct fib6_info *rt, struct nl_info *info); + +static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_dev; +} + +static inline +struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_lwtstate; +} -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); @@ -416,8 +449,14 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct fib6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); + +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..59656fc580df 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, @@ -100,29 +94,29 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); +int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i); -void rt6_flush_exceptions(struct rt6_info *rt); -int rt6_remove_exception_rt(struct rt6_info *rt); -void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *f6i); +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); + if (f6i && f6i->fib6_prefsrc.plen) { + *saddr = f6i->fib6_prefsrc.addr; + } else { + struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + + err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); + } return err; } @@ -137,8 +131,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); +struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -147,9 +142,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct fib6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct fib6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); @@ -174,14 +171,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct rt6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -269,12 +266,38 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, return daddr; } -static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) +static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { - return a->dst.dev == b->dst.dev && - a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 81d0f2107ff1..69c91d1934c1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,4 +449,6 @@ static inline void fib_proc_exit(struct net *net) } #endif +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); + #endif /* _NET_FIB_H */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 540a4b4417bf..90ff430f5e9d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ((const struct ipv6hdr *)iph)->hop_limit; + else + return 0; +} + /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) @@ -466,12 +477,12 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } -extern struct static_key ip_tunnel_metadata_cnt; +DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { - return static_key_false(&ip_tunnel_metadata_cnt); + return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index aea7a124e66b..6d6e21dee462 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -656,6 +656,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ @@ -750,14 +751,14 @@ struct ip_vs_app { * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); @@ -1315,8 +1316,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); -int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); -int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); +int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); +int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 836f31af1369..16475c269749 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -298,6 +298,7 @@ struct ipcm6_cookie { __s16 tclass; __s8 dontfrag; struct ipv6_txoptions *opt; + __u16 gso_size; }; static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) @@ -906,6 +907,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } +static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) +{ + return fl6->flowlabel & IPV6_FLOWLABEL_MASK; +} + /* * Prototypes exported by ipv6 */ @@ -950,6 +956,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, + struct inet_cork_full *cork, const struct sockcm_cookie *sockc); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) @@ -958,8 +965,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); - int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, @@ -1044,8 +1049,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b2f3a0c018e7..851a5e19ae32 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3378,6 +3378,8 @@ enum ieee80211_reconfig_type { * frame in case that no beacon was heard from the AP/P2P GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. + * If duration is greater than zero, mac80211 hints to the driver the + * duration for which the operation is requested. * The callback is optional and can (should!) sleep. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending @@ -3697,7 +3699,8 @@ struct ieee80211_ops { u32 sset, u8 *data); void (*mgd_prepare_tx)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + u16 duration); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); @@ -4450,6 +4453,19 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif); /** + * ieee80211_csa_set_counter - request mac80211 to set csa counter + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @counter: the new value for the counter + * + * The csa counter can be changed by the device, this API should be + * used by the device driver to update csa counter in mac80211. + * + * It should never be used together with ieee80211_csa_update_counter(), + * as it will cause a race condition around the counter value. + */ +void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); + +/** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. * diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e421f86af043..6c1eecd56a4d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -246,6 +246,7 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 +#define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 @@ -526,5 +527,21 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, } while (read_seqretry(&n->ha_lock, seq)); } - +static inline void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + u8 ndm_flags = 0; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + *notify = 1; + } +} #endif diff --git a/include/net/net_failover.h b/include/net/net_failover.h new file mode 100644 index 000000000000..b12a1c469d1c --- /dev/null +++ b/include/net/net_failover.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _NET_FAILOVER_H +#define _NET_FAILOVER_H + +#include <net/failover.h> + +/* failover state */ +struct net_failover_info { + /* primary netdev with same MAC */ + struct net_device __rcu *primary_dev; + + /* standby netdev */ + struct net_device __rcu *standby_dev; + + /* primary netdev stats */ + struct rtnl_link_stats64 primary_stats; + + /* standby netdev stats */ + struct rtnl_link_stats64 standby_stats; + + /* aggregated stats */ + struct rtnl_link_stats64 failover_stats; + + /* spinlock while updating stats */ + spinlock_t stats_lock; +}; + +struct failover *net_failover_create(struct net_device *standby_dev); +void net_failover_destroy(struct failover *failover); + +#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ + NETIF_F_HIGHDMA | NETIF_F_LRO) + +#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_RXCSUM | NETIF_F_ALL_TSO) + +#endif /* _NET_FAILOVER_H */ diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h index ebd869473603..cd24be4c4a99 100644 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -6,7 +6,7 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv4_register_notifier(void); diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h index 1ed4f2631ed6..0c3b5ebf0bb8 100644 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -3,7 +3,7 @@ #define _NF_NAT_MASQUERADE_IPV6_H_ unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e61184fbfb71..1910b6572430 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -13,4 +13,15 @@ unsigned int nf_conncount_count(struct net *net, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); + +unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit); + +bool nf_conncount_add(struct hlist_head *head, + const struct nf_conntrack_tuple *tuple); + +void nf_conncount_cache_free(struct hlist_head *hhead); + #endif diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 833752dd0c58..ba9fa4592f2b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include <linux/netdevice.h> #include <linux/rhashtable.h> #include <linux/rcupdate.h> +#include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/dst.h> struct nf_flowtable; @@ -13,25 +14,24 @@ struct nf_flowtable; struct nf_flowtable_type { struct list_head list; int family; - void (*gc)(struct work_struct *work); + int (*init)(struct nf_flowtable *ft); void (*free)(struct nf_flowtable *ft); - const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; }; struct nf_flowtable { + struct list_head list; struct rhashtable rhashtable; const struct nf_flowtable_type *type; struct delayed_work gc_work; }; enum flow_offload_tuple_dir { - FLOW_OFFLOAD_DIR_ORIGINAL, - FLOW_OFFLOAD_DIR_REPLY, - __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, + FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, + FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; -#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) struct flow_offload_tuple { union { @@ -55,6 +55,8 @@ struct flow_offload_tuple { int oifidx; + u16 mtu; + struct dst_entry *dst_cache; }; @@ -66,6 +68,7 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 +#define FLOW_OFFLOAD_TEARDOWN 0x8 struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; @@ -98,11 +101,14 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void nf_flow_offload_work_gc(struct work_struct *work); -extern const struct rhashtable_params nf_flow_offload_rhash_params; -void flow_offload_dead(struct flow_offload *flow); +void flow_offload_teardown(struct flow_offload *flow); +static inline void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 207a467e7ca6..a17eb2f8d40e 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -39,7 +39,7 @@ struct nf_conn_nat { /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, @@ -75,4 +75,8 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, #endif } +int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, + const struct nf_hook_ops *nat_ops, unsigned int ops_count); +void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, + unsigned int ops_count); #endif diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 235bd0e9a5aa..dc7cd0440229 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -11,6 +11,10 @@ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb); +unsigned int +nf_nat_inet_fn(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); + int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family); static inline int nf_nat_initialized(struct nf_conn *ct, @@ -22,11 +26,4 @@ static inline int nf_nat_initialized(struct nf_conn *ct, return ct->status & IPS_DST_NAT_DONE; } -struct nlattr; - -extern int -(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, - enum nf_nat_manip_type manip, - const struct nlattr *attr); - #endif /* _NF_NAT_CORE_H */ diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ce7c2b4e64bb..d300b8f03972 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -7,7 +7,7 @@ struct nf_nat_l3proto { u8 l3proto; bool (*in_range)(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range); + const struct nf_nat_range2 *range); u32 (*secure_port)(const struct nf_conntrack_tuple *t, __be16); @@ -33,7 +33,7 @@ struct nf_nat_l3proto { struct flowi *fl); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; int nf_nat_l3proto_register(const struct nf_nat_l3proto *); @@ -44,66 +44,14 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); -unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - -unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - -unsigned int nf_nat_ipv4_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - -unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); -unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - -unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); - -unsigned int nf_nat_ipv6_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); +int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); -unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); +int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); #endif /* _NF_NAT_L3PROTO_H */ diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 67835ff8a2d9..b4d6b29bca62 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -34,12 +34,12 @@ struct nf_nat_l4proto { */ void (*unique_tuple)(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; /* Protocol registration. */ @@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover); int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); #endif /*_NF_NAT_L4PROTO_H*/ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 5ddabb08c472..c129aacc8ae8 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, unsigned int hooknum); unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); #endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h index 8230fefff9f5..f9d7bee9bd4e 100644 --- a/include/net/netfilter/nf_socket.h +++ b/include/net/netfilter/nf_socket.h @@ -2,22 +2,7 @@ #ifndef _NF_SOCK_H_ #define _NF_SOCK_H_ -struct net_device; -struct sk_buff; -struct sock; -struct net; - -static inline bool nf_sk_is_transparent(struct sock *sk) -{ - switch (sk->sk_state) { - case TCP_TIME_WAIT: - return inet_twsk(sk)->tw_transparent; - case TCP_NEW_SYN_RECV: - return inet_rsk(inet_reqsk(sk))->no_srccheck; - default: - return inet_sk(sk)->transparent; - } -} +#include <net/sock.h> struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a1e28dd5d0bf..08c005ce56e9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -9,6 +9,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> +#include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> @@ -276,23 +277,6 @@ struct nft_set_estimate { enum nft_set_class space; }; -/** - * struct nft_set_type - nf_tables set type - * - * @select_ops: function to select nft_set_ops - * @ops: default ops, used when no select_ops functions is present - * @list: used internally - * @owner: module reference - */ -struct nft_set_type { - const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, - const struct nft_set_desc *desc, - u32 flags); - const struct nft_set_ops *ops; - struct list_head list; - struct module *owner; -}; - struct nft_set_ext; struct nft_expr; @@ -311,7 +295,6 @@ struct nft_expr; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size - * @features: features supported by the implementation */ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -360,11 +343,26 @@ struct nft_set_ops { const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_set *set); + void (*gc_init)(const struct nft_set *set); unsigned int elemsize; +}; + +/** + * struct nft_set_type - nf_tables set type + * + * @ops: set ops for this type + * @list: used internally + * @owner: module reference + * @features: features supported by the implementation + */ +struct nft_set_type { + const struct nft_set_ops ops; + struct list_head list; + struct module *owner; u32 features; - const struct nft_set_type *type; }; +#define to_set_type(o) container_of(o, struct nft_set_type, ops) int nft_register_set(struct nft_set_type *type); void nft_unregister_set(struct nft_set_type *type); @@ -374,6 +372,8 @@ void nft_unregister_set(struct nft_set_type *type); * * @list: table set list node * @bindings: list of set bindings + * @table: table this set belongs to + * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) @@ -397,6 +397,8 @@ void nft_unregister_set(struct nft_set_type *type); struct nft_set { struct list_head list; struct list_head bindings; + struct nft_table *table; + possible_net_t net; char *name; u64 handle; u32 ktype; @@ -590,7 +592,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } -static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext) +static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } @@ -608,7 +610,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies(*nft_set_ext_expiration(ext)); + time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, @@ -712,6 +714,7 @@ struct nft_expr_type { }; #define NFT_EXPR_STATEFUL 0x1 +#define NFT_EXPR_GC 0x2 /** * struct nft_expr_ops - nf_tables expression operations @@ -743,11 +746,15 @@ struct nft_expr_ops { const struct nft_expr *expr); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); + void (*destroy_clone)(const struct nft_ctx *ctx, + const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); + bool (*gc)(struct net *net, + const struct nft_expr *expr); const struct nft_expr_type *type; void *data; }; @@ -854,6 +861,7 @@ enum nft_chain_flags { * * @rules: list of rules in the chain * @list: used internally + * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain @@ -862,8 +870,11 @@ enum nft_chain_flags { * @name: name of the chain */ struct nft_chain { + struct nft_rule *__rcu *rules_gen_0; + struct nft_rule *__rcu *rules_gen_1; struct list_head rules; struct list_head list; + struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; @@ -871,8 +882,13 @@ struct nft_chain { u8 flags:6, genmask:2; char *name; + + /* Only used during control plane commit phase: */ + struct nft_rule **rules_next; }; +int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, @@ -889,8 +905,8 @@ enum nft_chain_types { * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions - * @init: chain initialization function - * @free: chain release function + * @ops_register: base chain register function + * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; @@ -899,8 +915,8 @@ struct nft_chain_type { struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NF_MAX_HOOKS]; - int (*init)(struct nft_ctx *ctx); - void (*free)(struct nft_ctx *ctx); + int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); + void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, @@ -952,7 +968,8 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * struct nft_table - nf_tables table * * @list: used internally - * @chains: chains in the table + * @chains_ht: chains in the table + * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table @@ -966,6 +983,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); */ struct nft_table { struct list_head list; + struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; @@ -1020,9 +1038,9 @@ static inline void *nft_obj_data(const struct nft_object *obj) #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, u32 objtype, - u8 genmask); +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); void nft_obj_notify(struct net *net, struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, @@ -1067,7 +1085,8 @@ struct nft_object_ops { int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); - void (*destroy)(struct nft_object *obj); + void (*destroy)(const struct nft_ctx *ctx, + struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); @@ -1111,12 +1130,9 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask); -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data); +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index ea5aab568be8..e0c0c2558ec4 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -2,6 +2,8 @@ #ifndef _NET_NF_TABLES_CORE_H #define _NET_NF_TABLES_CORE_H +#include <net/netfilter/nf_tables.h> + extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; extern struct nft_expr_type nft_lookup_type; @@ -10,6 +12,9 @@ extern struct nft_expr_type nft_byteorder_type; extern struct nft_expr_type nft_payload_type; extern struct nft_expr_type nft_dynset_type; extern struct nft_expr_type nft_range_type; +extern struct nft_expr_type nft_meta_type; +extern struct nft_expr_type nft_rt_type; +extern struct nft_expr_type nft_exthdr_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); @@ -20,6 +25,12 @@ struct nft_cmp_fast_expr { u8 len; }; +struct nft_immediate_expr { + struct nft_data data; + enum nft_registers dreg:8; + u8 dlen; +}; + /* Calculate the mask for the nft_cmp_fast expression. On big endian the * mask needs to include the *upper* bytes when interpreting that data as * something smaller than the full u32, therefore a cpu_to_le32 is done. diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h new file mode 100644 index 000000000000..9754a50ecde9 --- /dev/null +++ b/include/net/netfilter/nf_tproxy.h @@ -0,0 +1,113 @@ +#ifndef _NF_TPROXY_H_ +#define _NF_TPROXY_H_ + +#include <net/tcp.h> + +enum nf_tproxy_lookup_t { + NF_TPROXY_LOOKUP_LISTENER, + NF_TPROXY_LOOKUP_ESTABLISHED, +}; + +static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) +{ + if (inet_sk_transparent(sk)) + return true; + + sock_gen_put(sk); + return false; +} + +__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); + +/** + * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +struct sock * +nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk); + +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless if it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally if you have both rules the + * "socket" match will be the first one, effectively all packets + * belonging to established connections going through that one. + */ +struct sock * +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type); + +const struct in6_addr * +nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr); + +/** + * nf_tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @net: Network namespace. + * @laddr: IPv6 address to redirect to. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +struct sock * +nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + struct net *net, + const struct in6_addr *laddr, + const __be16 lport, + struct sock *sk); + +struct sock * +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type); + +#endif /* _NF_TPROXY_H_ */ diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h index 612cfb63ac68..ea32a7d3cf1b 100644 --- a/include/net/netfilter/nfnetlink_log.h +++ b/include/net/netfilter/nfnetlink_log.h @@ -1,18 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KER_NFNETLINK_LOG_H -#define _KER_NFNETLINK_LOG_H - -void -nfulnl_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li_user, - const char *prefix); - -#define NFULNL_COPY_DISABLED 0xff - -#endif /* _KER_NFNETLINK_LOG_H */ - diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h deleted file mode 100644 index 5c69e9b09388..000000000000 --- a/include/net/netfilter/nft_meta.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_META_H_ -#define _NFT_META_H_ - -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -extern const struct nla_policy nft_meta_policy[]; - -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr); - -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); - -#endif diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8491bc9c86b1..661348f23ea5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,8 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + int sysctl_tcp_comp_sack_nr; + unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c29f09cfc9d7..c978a31b0f84 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -43,6 +43,7 @@ struct netns_sysctl_ipv6 { int max_hbh_opts_cnt; int max_dst_opts_len; int max_hbh_opts_len; + int seg6_flowlabel; }; struct netns_ipv6 { @@ -60,7 +61,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct fib6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 48134353411d..94767ea3a490 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -4,13 +4,12 @@ #include <linux/list.h> -struct nft_af_info; - struct netns_nftables { struct list_head tables; struct list_head commit_list; unsigned int base_seq; u8 gencursor; + u8 validate_state; }; #endif diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..694d055e01ef --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include <linux/mm.h> /* Needed by ptr_ring */ +#include <linux/ptr_ring.h> +#include <linux/dma-direction.h> + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL + __page_pool_put_page(pool, page, allow_direct); +#endif +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + +#endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e828d31be5da..a3c1a2c47cd4 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -33,7 +33,7 @@ struct tcf_block_ext_info { }; struct tcf_block_cb; -bool tcf_queue_work(struct work_struct *work); +bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, @@ -683,9 +683,11 @@ static inline bool tc_skip_sw(u32 flags) /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { - if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)) + if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | + TCA_CLS_FLAGS_VERBOSE)) return false; + flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; @@ -705,7 +707,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - if (tc_skip_sw(flags)) + if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } @@ -776,6 +778,18 @@ struct tc_qopt_offload_stats { struct gnet_stats_queue *qstats; }; +enum tc_mq_command { + TC_MQ_CREATE, + TC_MQ_DESTROY, + TC_MQ_STATS, +}; + +struct tc_mq_qopt_offload { + enum tc_mq_command command; + u32 handle; + struct tc_qopt_offload_stats stats; +}; + enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, diff --git a/include/net/route.h b/include/net/route.h index dbb032d5921b..bb53cdba38dc 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); +void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric); void rt_add_uncached_list(struct rtable *rt); void rt_del_uncached_list(struct rtable *rt); diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 14b6b3af8918..0bbaa5488423 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -64,7 +64,7 @@ struct rtnl_link_ops { size_t priv_size; void (*setup)(struct net_device *dev); - int maxtype; + unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], @@ -92,7 +92,7 @@ struct rtnl_link_ops { unsigned int (*get_num_tx_queues)(void); unsigned int (*get_num_rx_queues)(void); - int slave_maxtype; + unsigned int slave_maxtype; const struct nla_policy *slave_policy; int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5154c8300262..6488daa32f82 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -30,7 +30,6 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, - __QDISC_STATE_RUNNING, }; struct qdisc_size_table { @@ -86,6 +85,8 @@ struct Qdisc { struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; + int padded; + refcount_t refcnt; /* * For performance sake on SMP, we put highly modified fields at the end @@ -98,10 +99,9 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; - int padded; - refcount_t refcnt; spinlock_t busylock ____cacheline_aligned_in_smp; + spinlock_t seqlock; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) @@ -111,15 +111,21 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) refcount_inc(&qdisc->refcnt); } -static inline bool qdisc_is_running(const struct Qdisc *qdisc) +static inline bool qdisc_is_running(struct Qdisc *qdisc) { + if (qdisc->flags & TCQ_F_NOLOCK) + return spin_is_locked(&qdisc->seqlock); return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { - if (qdisc_is_running(qdisc)) + if (qdisc->flags & TCQ_F_NOLOCK) { + if (!spin_trylock(&qdisc->seqlock)) + return false; + } else if (qdisc_is_running(qdisc)) { return false; + } /* Variant of write_seqcount_begin() telling lockdep a trylock * was attempted. */ @@ -131,6 +137,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running); + if (qdisc->flags & TCQ_F_NOLOCK) + spin_unlock(&qdisc->seqlock); } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -342,14 +350,14 @@ static inline int qdisc_qlen(const struct Qdisc *q) static inline int qdisc_qlen_sum(const struct Qdisc *q) { - __u32 qlen = 0; + __u32 qlen = q->qstats.qlen; int i; if (q->flags & TCQ_F_NOLOCK) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { - qlen = q->q.qlen; + qlen += q->q.qlen; } return qlen; diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 20ff237c5eb2..86f034b524d4 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -254,11 +254,10 @@ enum { SCTP_ARBITRARY_COOKIE_ECHO_LEN = 200 }; #define SCTP_TSN_MAP_SIZE 4096 /* We will not record more than this many duplicate TSNs between two - * SACKs. The minimum PMTU is 576. Remove all the headers and there - * is enough room for 131 duplicate reports. Round down to the + * SACKs. The minimum PMTU is 512. Remove all the headers and there + * is enough room for 117 duplicate reports. Round down to the * nearest power of 2. */ -enum { SCTP_MIN_PMTU = 576 }; enum { SCTP_MAX_DUP_TSNS = 16 }; enum { SCTP_MAX_GABS = 16 }; diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index e6d349b2a791..30b3e2fe240a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -429,32 +429,6 @@ static inline int sctp_list_single_entry(struct list_head *head) return (head->next != head) && (head->next == head->prev); } -/* Break down data chunks at this point. */ -static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) -{ - struct sctp_sock *sp = sctp_sk(asoc->base.sk); - struct sctp_af *af = sp->pf->af; - int frag = pmtu; - - frag -= af->ip_options_len(asoc->base.sk); - frag -= af->net_header_len; - frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream); - - if (asoc->user_frag) - frag = min_t(int, frag, asoc->user_frag); - - frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN - - sctp_datachk_len(&asoc->stream))); - - return frag; -} - -static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc) -{ - sctp_assoc_sync_pmtu(asoc); - asoc->pmtu_pending = 0; -} - static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) { return !list_empty(&chunk->list); @@ -608,17 +582,29 @@ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport * return t->dst; } -static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) +/* Calculate max payload size given a MTU, or the total overhead if + * given MTU is zero + */ +static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, + __u32 mtu, __u32 extra) { - __u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)), - SCTP_DEFAULT_MINSEGMENT); + __u32 overhead = sizeof(struct sctphdr) + extra; - if (t->pathmtu == pmtu) - return true; + if (sp) + overhead += sp->pf->af->net_header_len; + else + overhead += sizeof(struct ipv6hdr); - t->pathmtu = pmtu; + if (WARN_ON_ONCE(mtu && mtu <= overhead)) + mtu = overhead; - return false; + return mtu ? mtu - overhead : overhead; +} + +static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) +{ + return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), + SCTP_DEFAULT_MINSEGMENT)); } #endif /* __net_sctp_h__ */ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2d0e782c9055..5ef1bad81ef5 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -207,7 +207,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, int len, __u8 flags, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn); -struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc); +struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc); struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, @@ -215,7 +215,7 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk); -void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); +int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a0ec462bc1a9..ebf809eed33a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2091,16 +2091,14 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error); struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32); -struct sctp_transport *sctp_assoc_is_match(struct sctp_association *, - struct net *, - const union sctp_addr *, - const union sctp_addr *); void sctp_assoc_migrate(struct sctp_association *, struct sock *); int sctp_assoc_update(struct sctp_association *old, struct sctp_association *new); __u32 sctp_association_get_next_tsn(struct sctp_association *); +void sctp_assoc_update_frag_point(struct sctp_association *asoc); +void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu); void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); diff --git a/include/net/seg6.h b/include/net/seg6.h index 099bad59dc90..e029e301faa5 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -49,7 +49,11 @@ struct seg6_pernet_data { static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { +#if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; +#else + return NULL; +#endif } extern int seg6_init(void); @@ -63,5 +67,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); - +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h new file mode 100644 index 000000000000..661fd5b4d3e0 --- /dev/null +++ b/include/net/seg6_local.h @@ -0,0 +1,32 @@ +/* + * SR-IPv6 implementation + * + * Authors: + * David Lebrun <david.lebrun@uclouvain.be> + * eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_LOCAL_H +#define _NET_SEG6_LOCAL_H + +#include <linux/percpu.h> +#include <linux/net.h> +#include <linux/ipv6.h> + +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); + +struct seg6_bpf_srh_state { + bool valid; + u16 hdrlen; +}; + +DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + +#endif diff --git a/include/net/sock.h b/include/net/sock.h index 4d2e8ad98985..b3b75419eafe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,6 +481,11 @@ struct sock { void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; struct rcu_head sk_rcu; @@ -803,10 +808,10 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) } #ifdef CONFIG_NET -extern struct static_key memalloc_socks; +DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { - return static_key_false(&memalloc_socks); + return static_branch_unlikely(&memalloc_socks_key); } #else @@ -2330,6 +2335,22 @@ static inline bool sk_fullsock(const struct sock *sk) return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + */ +static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sock *sk = skb->sk; + + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#endif + + return skb; +} + /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ diff --git a/include/net/strparser.h b/include/net/strparser.h index d96b59f45eba..f177c87ce38b 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -90,6 +90,8 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); +/* Must be called with process lock held (lock_sock) */ +void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 39bc855d7fee..d574ce63bf22 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -155,6 +155,7 @@ struct switchdev_notifier_fdb_info { struct switchdev_notifier_info info; /* must be first */ const unsigned char *addr; u16 vid; + bool added_by_user; }; static inline struct net_device * diff --git a/include/net/tcp.h b/include/net/tcp.h index f88f8a2cab0d..0448e7c5d2b4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ +#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -333,8 +334,7 @@ void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th); +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); @@ -401,6 +401,10 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); @@ -552,7 +556,12 @@ void tcp_fin(struct sock *sk); void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { - hrtimer_cancel(&tcp_sk(sk)->pacing_timer); + if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) + __sock_put(sk); + + if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) + __sock_put(sk); + inet_csk_clear_xmit_timers(sk); } @@ -809,9 +818,8 @@ struct tcp_skb_cb { #endif } header; /* For incoming skbs */ struct { - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; void *data_end; } bpf; }; @@ -1865,6 +1873,10 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); +void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); +extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, + u32 reo_wnd); extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); @@ -2095,4 +2107,12 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif + +#if IS_ENABLED(CONFIG_TLS_DEVICE) +void clean_acked_data_enable(struct inet_connection_sock *icsk, + void (*cad)(struct sock *sk, u32 ack_seq)); +void clean_acked_data_disable(struct inet_connection_sock *icsk); + +#endif + #endif /* _TCP_H */ diff --git a/include/net/tipc.h b/include/net/tipc.h index 07670ec022a7..f0e7e6bc1bef 100644 --- a/include/net/tipc.h +++ b/include/net/tipc.h @@ -44,11 +44,11 @@ struct tipc_basic_hdr { __be32 w[4]; }; -static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) +static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; - int key; + __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) diff --git a/include/net/tls.h b/include/net/tls.h index f5fb16da3860..70c273777fe9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,24 +83,10 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; -struct tls_sw_context { +struct tls_sw_context_tx { struct crypto_aead *aead_send; - struct crypto_aead *aead_recv; struct crypto_wait async_wait; - /* Receive context */ - struct strparser strp; - void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); - struct sk_buff *recv_pkt; - u8 control; - bool decrypted; - - char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; - char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; - - /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; unsigned int sg_plaintext_size; @@ -117,6 +103,54 @@ struct tls_sw_context { struct scatterlist sg_aead_out[2]; }; +struct tls_sw_context_rx { + struct crypto_aead *aead_recv; + struct crypto_wait async_wait; + + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; + + char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; + char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; + +}; + +struct tls_record_info { + struct list_head list; + u32 end_seq; + int len; + int num_frags; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct tls_offload_context { + struct crypto_aead *aead_send; + spinlock_t lock; /* protects records list */ + struct list_head records_list; + struct tls_record_info *open_record; + struct tls_record_info *retransmit_hint; + u64 hint_record_sn; + u64 unacked_record_sn; + + struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; + void (*sk_destruct)(struct sock *sk); + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE \ + (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + enum { TLS_PENDING_CLOSED_RECORD }; @@ -141,9 +175,15 @@ struct tls_context { struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; }; - void *priv_ctx; + struct list_head list; + struct net_device *netdev; + refcount_t refcount; + + void *priv_ctx_tx; + void *priv_ctx_rx; - u8 conf:3; + u8 tx_conf:3; + u8 rx_conf:3; struct cipher_context tx; struct cipher_context rx; @@ -181,7 +221,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_resources(struct sock *sk); +void tls_sw_free_resources_tx(struct sock *sk); +void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, @@ -190,9 +231,28 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); -void tls_icsk_clean_acked(struct sock *sk); +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +void tls_device_sk_destruct(struct sock *sk); +void tls_device_init(void); +void tls_device_cleanup(void); + +struct tls_record_info *tls_get_record(struct tls_offload_context *context, + u32 seq, u64 *p_record_sn); + +static inline bool tls_record_is_start_marker(struct tls_record_info *rec) +{ + return rec->len == 0; +} +static inline u32 tls_record_start_seq(struct tls_record_info *rec) +{ + return rec->end_seq - rec->len; +} + +void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); @@ -229,6 +289,13 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) +{ + return sk_fullsock(sk) && + /* matches smp_store_release in tls_set_device_offload */ + smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +} + static inline void tls_err_abort(struct sock *sk, int err) { sk->sk_err = err; @@ -301,16 +368,22 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk) return icsk->icsk_ulp_data; } -static inline struct tls_sw_context *tls_sw_ctx( +static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { - return (struct tls_sw_context *)tls_ctx->priv_ctx; + return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; +} + +static inline struct tls_sw_context_tx *tls_sw_ctx_tx( + const struct tls_context *tls_ctx) +{ + return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context *tls_offload_ctx( const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx; + return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, @@ -318,4 +391,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +struct sk_buff *tls_validate_xmit_skb(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct tls_crypto_info *crypto_info); + #endif /* _TLS_OFFLOAD_H */ diff --git a/include/net/udp.h b/include/net/udp.h index d8ca3b26964d..7ba0ed252c52 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,9 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, struct udphdr *uh, udp_lookup_t lookup); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); +struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, + netdev_features_t features); + static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; @@ -269,6 +272,7 @@ int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); +int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad73d8b3fcc2..b99a02ae3934 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,7 @@ struct vxlan_dev { #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 +#define VXLAN_F_TTL_INHERIT 0x10000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..2deea7166a34 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,115 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, + MEM_TYPE_ZERO_COPY, + MEM_TYPE_MAX, +}; + +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; +}; + +struct page_pool; + +struct zero_copy_allocator { + void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + unsigned long handle; + struct xdp_rxq_info *rxq; +}; + +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* TODO: implement clone, copy, use "native" MEM_TYPE */ + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + return NULL; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + +void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); + +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} #endif /* __LINUX_NET_XDP_H__ */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h new file mode 100644 index 000000000000..9fe472f2ac95 --- /dev/null +++ b/include/net/xdp_sock.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* AF_XDP internal functions + * Copyright(c) 2018 Intel Corporation. + */ + +#ifndef _LINUX_XDP_SOCK_H +#define _LINUX_XDP_SOCK_H + +#include <linux/workqueue.h> +#include <linux/if_xdp.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <net/sock.h> + +struct net_device; +struct xsk_queue; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem_page { + void *addr; + dma_addr_t dma; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct xdp_umem_page *pages; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + struct page **pgs; + u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; +}; + +struct xdp_sock { + /* struct sock must be the first member of struct xdp_sock */ + struct sock sk; + struct xsk_queue *rx; + struct net_device *dev; + struct xdp_umem *umem; + struct list_head flush_node; + u16 queue_id; + struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; + /* Protects multiple processes in the control path */ + struct mutex mutex; + u64 rx_dropped; +}; + +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} + +static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return false; +} +#endif /* CONFIG_XDP_SOCKETS */ + +#endif /* _LINUX_XDP_SOCK_H */ |