From 4749c3ef854e3a5d3dd3cc0ccd2dcb7e05d583bd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 30 Apr 2015 12:12:00 +0200 Subject: net: sched: remove TC_MUNGED bits Not used. pedit sets TC_MUNGED when packet content was altered, but all the core does is unset MUNGED again and then set OK2MUNGE. And the latter isn't tested anywhere. So lets remove both TC_MUNGED and TC_OK2MUNGE. Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index bf08e76bf505..6810ca43a80a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -35,6 +35,8 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance * * */ +#ifndef __KERNEL__ +/* backwards compat for userspace only */ #define TC_MUNGED _TC_MAKEMASK1(0) #define SET_TC_MUNGED(v) ( TC_MUNGED | (v & ~TC_MUNGED)) #define CLR_TC_MUNGED(v) ( v & ~TC_MUNGED) @@ -42,6 +44,7 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance #define TC_OK2MUNGE _TC_MAKEMASK1(1) #define SET_TC_OK2MUNGE(v) ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE)) #define CLR_TC_OK2MUNGE(v) ( v & ~TC_OK2MUNGE) +#endif #define S_TC_VERD _TC_MAKE32(2) #define M_TC_VERD _TC_MAKEMASK(4,S_TC_VERD) -- cgit v1.2.3 From c19ae86a510cf4332af64caab04718bc853d3184 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 1 May 2015 22:19:43 -0700 Subject: tc: remove unused redirect ttl improves ingress+u32 performance from 22.4 Mpps to 22.9 Mpps Signed-off-by: Jamal Hadi Salim Signed-off-by: Alexei Starovoitov Acked-by: Florian Westphal Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 6810ca43a80a..596ffa0c7084 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -65,11 +65,13 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance #define SET_TC_NCLS(v) ( TC_NCLS | (v & ~TC_NCLS)) #define CLR_TC_NCLS(v) ( v & ~TC_NCLS) +#ifndef __KERNEL__ #define S_TC_RTTL _TC_MAKE32(9) #define M_TC_RTTL _TC_MAKEMASK(3,S_TC_RTTL) #define G_TC_RTTL(x) _TC_GETVALUE(x,S_TC_RTTL,M_TC_RTTL) #define V_TC_RTTL(x) _TC_MAKEVALUE(x,S_TC_RTTL) #define SET_TC_RTTL(v,n) ((V_TC_RTTL(n)) | (v & ~M_TC_RTTL)) +#endif #define S_TC_AT _TC_MAKE32(12) #define M_TC_AT _TC_MAKEMASK(2,S_TC_AT) -- cgit v1.2.3 From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 May 2015 21:34:46 -0700 Subject: tcp: provide SYN headers for passive connections This patch allows a server application to get the TCP SYN headers for its passive connections. This is useful if the server is doing fingerprinting of clients based on SYN packet contents. Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN. The first is used on a socket to enable saving the SYN headers for child connections. This can be set before or after the listen() call. The latter is used to retrieve the SYN headers for passive connections, if the parent listener has enabled TCP_SAVE_SYN. TCP_SAVED_SYN is read once, it frees the saved SYN headers. The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP headers. Original patch was written by Tom Herbert, I changed it to not hold a full skb (and associated dst and conntracking reference). We have used such patch for about 3 years at Google. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index faa72f4fa547..51ebedba577f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -113,6 +113,8 @@ enum { #define TCP_TIMESTAMP 24 #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ struct tcp_repair_opt { __u32 opt_code; -- cgit v1.2.3 From a2f11835994ed5bcd6d66c7205947cc482231b08 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Tue, 5 May 2015 09:07:16 -0700 Subject: can.h: make padding given by gcc explicit The current definition of struct can_frame has a 16-byte size, with 8-byte alignment, but the 3 bytes of padding are not explicit like the similar 2 bytes of padding of struct canfd_frame. Make it explicit so it is easier to read. Signed-off-by: Shawn Landden Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 41892f720057..9692cda5f8fc 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -95,11 +95,17 @@ typedef __u32 can_err_mask_t; * @can_dlc: frame payload length in byte (0 .. 8) aka data length code * N.B. the DLC field from ISO 11898-1 Chapter 8.4.2.3 has a 1:1 * mapping of the 'data length code' to the real payload length + * @__pad: padding + * @__res0: reserved / padding + * @__res1: reserved / padding * @data: CAN frame payload (up to 8 byte) */ struct can_frame { canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */ __u8 can_dlc; /* frame payload length in byte (0 .. CAN_MAX_DLEN) */ + __u8 __pad; /* padding */ + __u8 __res0; /* reserved / padding */ + __u8 __res1; /* reserved / padding */ __u8 data[CAN_MAX_DLEN] __attribute__((aligned(8))); }; -- cgit v1.2.3 From 06f207fc541862ba8902ceda0ddeade6ea6bce72 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 6 May 2015 16:28:31 +0300 Subject: cfg80211: change GO_CONCURRENT to IR_CONCURRENT for STA The GO_CONCURRENT regulatory definition can be extended to station interfaces requesting to IR as part of TDLS off-channel operations. Rename the GO_CONCURRENT flag to IR_CONCURRENT and allow the added use-case. Change internal users of GO_CONCURRENT to use the new definition. Signed-off-by: Arik Nemtsov Reviewed-by: Johannes Berg Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 241220c43e86..c0ab6b0a3919 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2620,16 +2620,17 @@ enum nl80211_band_attr { * an indoor surroundings, i.e., it is connected to AC power (and not * through portable DC inverters) or is under the control of a master * that is acting as an AP and is connected to AC power. - * @NL80211_FREQUENCY_ATTR_GO_CONCURRENT: GO operation is allowed on this + * @NL80211_FREQUENCY_ATTR_IR_CONCURRENT: IR operation is allowed on this * channel if it's connected concurrently to a BSS on the same channel on * the 2 GHz band or to a channel in the same UNII band (on the 5 GHz - * band), and IEEE80211_CHAN_RADAR is not set. Instantiating a GO on a - * channel that has the GO_CONCURRENT attribute set can be done when there - * is a clear assessment that the device is operating under the guidance of - * an authorized master, i.e., setting up a GO while the device is also - * connected to an AP with DFS and radar detection on the UNII band (it is - * up to user-space, i.e., wpa_supplicant to perform the required - * verifications) + * band), and IEEE80211_CHAN_RADAR is not set. Instantiating a GO or TDLS + * off-channel on a channel that has the IR_CONCURRENT attribute set can be + * done when there is a clear assessment that the device is operating under + * the guidance of an authorized master, i.e., setting up a GO or TDLS + * off-channel while the device is also connected to an AP with DFS and + * radar detection on the UNII band (it is up to user-space, i.e., + * wpa_supplicant to perform the required verifications). Using this + * attribute for IR is disallowed for master interfaces (IBSS, AP). * @NL80211_FREQUENCY_ATTR_NO_20MHZ: 20 MHz operation is not allowed * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed @@ -2641,7 +2642,7 @@ enum nl80211_band_attr { * See https://apps.fcc.gov/eas/comments/GetPublishedDocument.html?id=327&tn=528122 * for more information on the FCC description of the relaxations allowed * by NL80211_FREQUENCY_ATTR_INDOOR_ONLY and - * NL80211_FREQUENCY_ATTR_GO_CONCURRENT. + * NL80211_FREQUENCY_ATTR_IR_CONCURRENT. */ enum nl80211_frequency_attr { __NL80211_FREQUENCY_ATTR_INVALID, @@ -2659,7 +2660,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_160MHZ, NL80211_FREQUENCY_ATTR_DFS_CAC_TIME, NL80211_FREQUENCY_ATTR_INDOOR_ONLY, - NL80211_FREQUENCY_ATTR_GO_CONCURRENT, + NL80211_FREQUENCY_ATTR_IR_CONCURRENT, NL80211_FREQUENCY_ATTR_NO_20MHZ, NL80211_FREQUENCY_ATTR_NO_10MHZ, @@ -2672,6 +2673,8 @@ enum nl80211_frequency_attr { #define NL80211_FREQUENCY_ATTR_PASSIVE_SCAN NL80211_FREQUENCY_ATTR_NO_IR #define NL80211_FREQUENCY_ATTR_NO_IBSS NL80211_FREQUENCY_ATTR_NO_IR #define NL80211_FREQUENCY_ATTR_NO_IR NL80211_FREQUENCY_ATTR_NO_IR +#define NL80211_FREQUENCY_ATTR_GO_CONCURRENT \ + NL80211_FREQUENCY_ATTR_IR_CONCURRENT /** * enum nl80211_bitrate_attr - bitrate attributes @@ -2830,7 +2833,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated * base on contiguous rules and wider channels will be allowed to cross * multiple contiguous/overlapping frequency ranges. - * @NL80211_RRF_GO_CONCURRENT: See &NL80211_FREQUENCY_ATTR_GO_CONCURRENT + * @NL80211_RRF_IR_CONCURRENT: See &NL80211_FREQUENCY_ATTR_IR_CONCURRENT * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed @@ -2847,7 +2850,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_IR = 1<<7, __NL80211_RRF_NO_IBSS = 1<<8, NL80211_RRF_AUTO_BW = 1<<11, - NL80211_RRF_GO_CONCURRENT = 1<<12, + NL80211_RRF_IR_CONCURRENT = 1<<12, NL80211_RRF_NO_HT40MINUS = 1<<13, NL80211_RRF_NO_HT40PLUS = 1<<14, NL80211_RRF_NO_80MHZ = 1<<15, @@ -2859,6 +2862,7 @@ enum nl80211_reg_rule_flags { #define NL80211_RRF_NO_IR NL80211_RRF_NO_IR #define NL80211_RRF_NO_HT40 (NL80211_RRF_NO_HT40MINUS |\ NL80211_RRF_NO_HT40PLUS) +#define NL80211_RRF_GO_CONCURRENT NL80211_RRF_IR_CONCURRENT /* For backport compatibility with older userspace */ #define NL80211_RRF_NO_IR_ALL (NL80211_RRF_NO_IR | __NL80211_RRF_NO_IBSS) -- cgit v1.2.3 From e520af48c7e5acae5f17f82a79ba7ab7cf156f3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2015 14:26:25 -0700 Subject: tcp: add TCPWinProbe and TCPKeepAlive SNMP counters Diagnosing problems related to Window Probes has been hard because we lack a counter. TCPWinProbe counts the number of ACK packets a sender has to send at regular intervals to make sure a reverse ACK packet opening back a window had not been lost. TCPKeepAlive counts the number of ACK packets sent to keep TCP flows alive (SO_KEEPALIVE) Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 6a6fb747c78d..eee8968407f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */ LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */ LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ + LINUX_MIB_TCPWINPROBE, /* TCPWinProbe */ + LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ __LINUX_MIB_MAX }; -- cgit v1.2.3 From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow to listen "all" netns More accurately, listen all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via an anscillary data. With this patch, a daemon needs only one socket to listen many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates if the field nsid is valid or not. skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid 0 by error to the userland. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1a85940f8ab7..3e34b7d702f8 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -108,6 +108,7 @@ struct nlmsgerr { #define NETLINK_NO_ENOBUFS 5 #define NETLINK_RX_RING 6 #define NETLINK_TX_RING 7 +#define NETLINK_LISTEN_ALL_NSID 8 struct nl_pktinfo { __u32 group; -- cgit v1.2.3 From 80ba92fa1a92dea128283f69f55b02242e213650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 May 2015 15:05:12 -0700 Subject: codel: add ce_threshold attribute For DCTCP or similar ECN based deployments on fabrics with shallow buffers, hosts are responsible for a good part of the buffering. This patch adds an optional ce_threshold to codel & fq_codel qdiscs, so that DCTCP can have feedback from queuing in the host. A DCTCP enabled egress port simply have a queue occupancy threshold above which ECT packets get CE mark. In codel language this translates to a sojourn time, so that one doesn't have to worry about bytes or bandwidth but delays. This makes the host an active participant in the health of the whole network. This also helps experimenting DCTCP in a setup without DCTCP compliant fabric. On following example, ce_threshold is set to 1ms, and we can see from 'ldelay xxx us' that TCP is not trying to go around the 5ms codel target. Queue has more capacity to absorb inelastic bursts (say from UDP traffic), as queues are maintained to an optimal level. lpaa23:~# ./tc -s -d qd sh dev eth1 qdisc mq 1: dev eth1 root Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961) backlog 3108242b 364p requeues 42961 qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503) rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503 count 0 lastcount 0 ldelay 1.0ms drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384 qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186) rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186 count 0 lastcount 0 ldelay 694us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873 qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554) rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554 count 0 lastcount 0 ldelay 889us drop_next 0us maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780 ... Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Daniel Borkmann Cc: Glenn Judd Cc: Nandita Dukkipati Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 534b84710745..69d88b309cc7 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -679,6 +679,7 @@ enum { TCA_CODEL_LIMIT, TCA_CODEL_INTERVAL, TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, __TCA_CODEL_MAX }; @@ -695,6 +696,7 @@ struct tc_codel_xstats { __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ }; /* FQ_CODEL */ @@ -707,6 +709,7 @@ enum { TCA_FQ_CODEL_ECN, TCA_FQ_CODEL_FLOWS, TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, __TCA_FQ_CODEL_MAX }; @@ -730,6 +733,7 @@ struct tc_fq_codel_qd_stats { */ __u32 new_flows_len; /* count of flows in new list */ __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ }; struct tc_fq_codel_cl_stats { -- cgit v1.2.3 From 171a42c38c6e1a5a076d6276e94e55a0b5b7868c Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Sat, 9 May 2015 00:01:58 -0700 Subject: bonding: add netlink support for sys prio, actor sys mac, and port key Adds netlink support for the following bonding options: * BOND_OPT_AD_ACTOR_SYS_PRIO * BOND_OPT_AD_ACTOR_SYSTEM * BOND_OPT_AD_USER_PORT_KEY When setting the actor system mac address we assume the netlink message contains a binary mac and not a string representation of a mac. Signed-off-by: Andy Gospodarek [jt: completed the setting side of the netlink attributes] Signed-off-by: Jonathan Toppins Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d9cd19214b98..6d6e502e1051 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -417,6 +417,9 @@ enum { IFLA_BOND_AD_LACP_RATE, IFLA_BOND_AD_SELECT, IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From a3eb95f891d6130b1fc03dd07a8b54cf0a5c8ab8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Sat, 9 May 2015 22:01:46 -0400 Subject: net_sched: gred: add TCA_GRED_LIMIT attribute In a GRED qdisc, if the default "virtual queue" (VQ) does not have drop parameters configured, then packets for the default VQ are not subjected to RED and are only dropped if the queue is larger than the net_device's tx_queue_len. This behavior is useful for WRED mode, since these packets will still influence the calculated average queue length and (therefore) the drop probability for all of the other VQs. However, for some drivers tx_queue_len is zero. In other cases the user may wish to make the limit the same for all VQs (including the default VQ with no drop parameters). This change adds a TCA_GRED_LIMIT attribute to set the GRED queue limit, in bytes, during qdisc setup. (This limit is in bytes to be consistent with the drop parameters.) The default limit is the same as for a bfifo queue (tx_queue_len * psched_mtu). If the drop parameters of any VQ are configured with a smaller limit than the GRED queue limit, that VQ will still observe the smaller limit instead. Signed-off-by: David Ward Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 69d88b309cc7..8d2530daca9f 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -268,7 +268,8 @@ enum { TCA_GRED_STAB, TCA_GRED_DPS, TCA_GRED_MAX_P, - __TCA_GRED_MAX, + TCA_GRED_LIMIT, + __TCA_GRED_MAX, }; #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) -- cgit v1.2.3 From e578d9c02587d57bfa7b560767c698a668a468c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 May 2015 19:50:41 +0200 Subject: net: sched: use counter to break reclassify loops Seems all we want here is to avoid endless 'goto reclassify' loop. tc_classify_compat even resets this counter when something other than TC_ACT_RECLASSIFY is returned, so this skb-counter doesn't break hypothetical loops induced by something other than perpetual TC_ACT_RECLASSIFY return values. skb_act_clone is now identical to skb_clone, so just use that. Tested with following (bogus) filter: tc filter add dev eth0 parent ffff: \ protocol ip u32 match u32 0 0 police rate 10Kbit burst \ 64000 mtu 1500 action reclassify Acked-by: Daniel Borkmann Signed-off-by: Florian Westphal Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 596ffa0c7084..ffc112c8e1c2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -44,13 +44,13 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance #define TC_OK2MUNGE _TC_MAKEMASK1(1) #define SET_TC_OK2MUNGE(v) ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE)) #define CLR_TC_OK2MUNGE(v) ( v & ~TC_OK2MUNGE) -#endif #define S_TC_VERD _TC_MAKE32(2) #define M_TC_VERD _TC_MAKEMASK(4,S_TC_VERD) #define G_TC_VERD(x) _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD) #define V_TC_VERD(x) _TC_MAKEVALUE(x,S_TC_VERD) #define SET_TC_VERD(v,n) ((V_TC_VERD(n)) | (v & ~M_TC_VERD)) +#endif #define S_TC_FROM _TC_MAKE32(6) #define M_TC_FROM _TC_MAKEMASK(2,S_TC_FROM) -- cgit v1.2.3 From 77b9900ef53ae047e36a37d13a2aa33bb2d60641 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:21 +0200 Subject: tc: introduce Flower classifier This patch introduces a flow-based filter. So far, the very essential packet fields are supported. This patch is only the first step. There is a lot of potential performance improvements possible to implement. Also a lot of features are missing now. They will be addressed in follow-up patches. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ffc112c8e1c2..39fb53d67b11 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -409,6 +409,36 @@ enum { #define TCA_BPF_MAX (__TCA_BPF_MAX - 1) +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + /* Extended Matches */ struct tcf_ematch_tree_hdr { -- cgit v1.2.3 From a9b6391814d5d6b8668fca2dace86949b7244e2e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 12 May 2015 11:56:50 -0400 Subject: packet: rollover statistics Rollover indicates exceptional conditions. Export a counter to inform socket owners of this state. If no socket with sufficient room is found, rollover fails. Also count these events. Finally, also count when flows are rolled over early thanks to huge flow detection, to validate its correctness. Tested: Read counters in bench_rollover on all other tests in the patchset Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 053bd102fbe0..d3d715f8c88f 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -54,6 +54,7 @@ struct sockaddr_ll { #define PACKET_FANOUT 18 #define PACKET_TX_HAS_OFF 19 #define PACKET_QDISC_BYPASS 20 +#define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 @@ -75,6 +76,12 @@ struct tpacket_stats_v3 { unsigned int tp_freeze_q_cnt; }; +struct tpacket_rollover_stats { + __aligned_u64 tp_all; + __aligned_u64 tp_huge; + __aligned_u64 tp_failed; +}; + union tpacket_stats_u { struct tpacket_stats stats1; struct tpacket_stats_v3 stats3; -- cgit v1.2.3 From 2d07dc79fe04a43d82a346ced6bbf07bdb523f1b Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 13 May 2015 12:57:30 -0400 Subject: geneve: add initial netdev driver for GENEVE tunnels This is an initial implementation of a netdev driver for GENEVE tunnels. This implementation uses a fixed UDP port, and only supports point-to-point links with specific partner endpoints. Only IPv4 links are supported at this time. Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6d6e502e1051..afccc9393fef 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -390,6 +390,15 @@ struct ifla_vxlan_port_range { __be16 high; }; +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + /* Bonding section */ enum { -- cgit v1.2.3 From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:38 +0200 Subject: netfilter: add netfilter ingress hook after handle_ing() under unique static key This patch adds the Netfilter ingress hook just after the existing tc ingress hook, that seems to be the consensus solution for this. Note that the Netfilter hook resides under the global static key that enables ingress filtering. Nonetheless, Netfilter still also has its own static key for minimal impact on the existing handle_ing(). * Without this patch: Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags) 16086246pps 7721Mb/sec (7721398080bps) errors: 100000000 42.46% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 25.92% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.81% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.62% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.70% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.34% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.44% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch: Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags) 16090536pps 7723Mb/sec (7723457280bps) errors: 100000000 41.23% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 26.57% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.72% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.55% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.78% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.06% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.43% kpktgend_0 [kernel.kallsyms] [k] __build_skb * Without this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags) 10788648pps 5178Mb/sec (5178551040bps) errors: 100000000 40.99% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.50% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.77% kpktgend_0 [cls_u32] [k] u32_classify 5.62% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.18% kpktgend_0 [pktgen] [k] pktgen_thread_worker 3.23% kpktgend_0 [kernel.kallsyms] [k] tc_classify 2.97% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 1.83% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.50% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 0.99% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags) 10743194pps 5156Mb/sec (5156733120bps) errors: 100000000 42.01% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.78% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.70% kpktgend_0 [cls_u32] [k] u32_classify 5.46% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.16% kpktgend_0 [pktgen] [k] pktgen_thread_worker 2.98% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.84% kpktgend_0 [kernel.kallsyms] [k] tc_classify 1.96% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.57% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk Note that the results are very similar before and after. I can see gcc gets the code under the ingress static key out of the hot path. Then, on that cold branch, it generates the code to accomodate the netfilter ingress static key. My explanation for this is that this reduces the pressure on the instruction cache for non-users as the new code is out of the hot path, and it comes with minimal impact for tc ingress users. Using gcc version 4.8.4 on: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 [...] L1d cache: 16K L1i cache: 64K L2 cache: 2048K L3 cache: 8192K Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/netfilter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ef1b1f88ca18..177027cce6b3 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -51,11 +51,17 @@ enum nf_inet_hooks { NF_INET_NUMHOOKS }; +enum nf_dev_hooks { + NF_NETDEV_INGRESS, + NF_NETDEV_NUMHOOKS +}; + enum { NFPROTO_UNSPEC = 0, NFPROTO_INET = 1, NFPROTO_IPV4 = 2, NFPROTO_ARP = 3, + NFPROTO_NETDEV = 5, NFPROTO_BRIDGE = 7, NFPROTO_IPV6 = 10, NFPROTO_DECNET = 12, -- cgit v1.2.3 From c9a70d43461d83818825ae065bb8fc887421e150 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 18 May 2015 16:31:47 +0530 Subject: net-next: ethtool: Added port speed macros. Signed-off-by: Parav Pandit Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 2e49fc880d29..ae832b45b44c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1264,15 +1264,19 @@ enum ethtool_sfeatures_retval_bits { * it was forced up into this mode or autonegotiated. */ -/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|10|20|40|56]GbE. */ +/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 #define SPEED_2500 2500 +#define SPEED_5000 5000 #define SPEED_10000 10000 #define SPEED_20000 20000 +#define SPEED_25000 25000 #define SPEED_40000 40000 +#define SPEED_50000 50000 #define SPEED_56000 56000 +#define SPEED_100000 100000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:03 -0700 Subject: bpf: allow bpf programs to tail-call other bpf programs introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding extra call frame. It's trivially done in interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are idenitified by file descriptor, user space need to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... } int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for TC use case, bpf_tail_call() allows to implement reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay performance penalty for this feature and tail call itself would be slower, since mandatory stack unwind, return, stack allocate would be done for every tailcall. . tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either global per_cpu variable accessed by helper and by wrapper or global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. - bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9ebdf5701e8..f0a9af8b4dae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, }; enum bpf_prog_type { @@ -210,6 +211,15 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_l4_csum_replace, + + /** + * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success + */ + BPF_FUNC_tail_call, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 20 May 2015 16:35:41 -0700 Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info This patch tracks the total number of inbound and outbound segments on a TCP socket. One may use this number to have an idea on connection quality when compared against the retransmissions. RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut These are a 32bit field each and can be fetched both from TCP_INFO getsockopt() if one has a handle on a TCP socket, or from inet_diag netlink facility (iproute2/ss patch will follow) Note that tp->segs_out was placed near tp->snd_nxt for good data locality and minimal performance impact, while tp->segs_in was placed near tp->bytes_received for the same reason. Join work with Eric Dumazet. Note that received SYN are accounted on the listener, but sent SYNACK are not accounted. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 51ebedba577f..65a77b071e22 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -192,8 +192,10 @@ struct tcp_info { __u64 tcpi_pacing_rate; __u64 tcpi_max_pacing_rate; - __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ + __u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */ __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ + __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ + __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From bd5850d39f10f9d216bff69bcbf5938680b862ae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 May 2015 02:26:24 +0200 Subject: net: sched: pkt_cls: remove unused macros from uapi Jamal points out that this header also contains kernel internal magic that cannot be used from userspace for anything meaningful. Lets remove what the kernel doesn't use anymore and wrap remainder with __KERNEL__. Suggested-by: Jamal Hadi Salim Suggested-by: Alexei Starovoitov Signed-off-by: Florian Westphal Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 39fb53d67b11..4f0d1bc3647d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -4,6 +4,7 @@ #include #include +#ifdef __KERNEL__ /* I think i could have done better macros ; for now this is stolen from * some arch/mips code - jhs */ @@ -35,23 +36,6 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance * * */ -#ifndef __KERNEL__ -/* backwards compat for userspace only */ -#define TC_MUNGED _TC_MAKEMASK1(0) -#define SET_TC_MUNGED(v) ( TC_MUNGED | (v & ~TC_MUNGED)) -#define CLR_TC_MUNGED(v) ( v & ~TC_MUNGED) - -#define TC_OK2MUNGE _TC_MAKEMASK1(1) -#define SET_TC_OK2MUNGE(v) ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE)) -#define CLR_TC_OK2MUNGE(v) ( v & ~TC_OK2MUNGE) - -#define S_TC_VERD _TC_MAKE32(2) -#define M_TC_VERD _TC_MAKEMASK(4,S_TC_VERD) -#define G_TC_VERD(x) _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD) -#define V_TC_VERD(x) _TC_MAKEVALUE(x,S_TC_VERD) -#define SET_TC_VERD(v,n) ((V_TC_VERD(n)) | (v & ~M_TC_VERD)) -#endif - #define S_TC_FROM _TC_MAKE32(6) #define M_TC_FROM _TC_MAKEMASK(2,S_TC_FROM) #define G_TC_FROM(x) _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM) @@ -65,20 +49,16 @@ bits 9,10,11: redirect counter - redirect TTL. Loop avoidance #define SET_TC_NCLS(v) ( TC_NCLS | (v & ~TC_NCLS)) #define CLR_TC_NCLS(v) ( v & ~TC_NCLS) -#ifndef __KERNEL__ -#define S_TC_RTTL _TC_MAKE32(9) -#define M_TC_RTTL _TC_MAKEMASK(3,S_TC_RTTL) -#define G_TC_RTTL(x) _TC_GETVALUE(x,S_TC_RTTL,M_TC_RTTL) -#define V_TC_RTTL(x) _TC_MAKEVALUE(x,S_TC_RTTL) -#define SET_TC_RTTL(v,n) ((V_TC_RTTL(n)) | (v & ~M_TC_RTTL)) -#endif - #define S_TC_AT _TC_MAKE32(12) #define M_TC_AT _TC_MAKEMASK(2,S_TC_AT) #define G_TC_AT(x) _TC_GETVALUE(x,S_TC_AT,M_TC_AT) #define V_TC_AT(x) _TC_MAKEVALUE(x,S_TC_AT) #define SET_TC_AT(v,n) ((V_TC_AT(n)) | (v & ~M_TC_AT)) +#define MAX_REC_LOOP 4 +#define MAX_RED_LOOP 4 +#endif + /* Action attributes */ enum { TCA_ACT_UNSPEC, @@ -98,8 +78,6 @@ enum { #define TCA_ACT_NOUNBIND 0 #define TCA_ACT_REPLACE 1 #define TCA_ACT_NOREPLACE 0 -#define MAX_REC_LOOP 4 -#define MAX_RED_LOOP 4 #define TC_ACT_UNSPEC (-1) #define TC_ACT_OK 0 -- cgit v1.2.3 From d52d3997f843ffefaa8d8462790ffcaca6c74192 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 22 May 2015 20:56:06 -0700 Subject: ipv6: Create percpu rt6_info After the patch 'ipv6: Only create RTF_CACHE routes after encountering pmtu exception', we need to compensate the performance hit (bouncing dst->__refcnt). Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Steffen Klassert Cc: Julian Anastasov Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 2be7bd174751..f6598d1c886e 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -34,6 +34,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 +#define RTF_PCPU 0x40000000 #define RTF_LOCAL 0x80000000 -- cgit v1.2.3 From ebddf1a8d78aa3436353fae75c4396e50cb2d6cf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 26 May 2015 18:41:20 +0200 Subject: netfilter: nf_tables: allow to bind table to net_device This patch adds the internal NFT_AF_NEEDS_DEV flag to indicate that you must attach this table to a net_device. This change is required by the follow up patch that introduces the new netdev table. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5fa1cd04762e..89a671e0f5e7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -146,12 +146,14 @@ enum nft_table_flags { * @NFTA_TABLE_NAME: name of the table (NLA_STRING) * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) + * @NFTA_TABLE_DEV: net device name (NLA_STRING) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, NFTA_TABLE_NAME, NFTA_TABLE_FLAGS, NFTA_TABLE_USE, + NFTA_TABLE_DEV, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) -- cgit v1.2.3 From 8cf6f497de405ca9eb87ffb34d90699962d10125 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 26 May 2015 08:22:49 -0700 Subject: ethtool: Add helper routines to pass vf to rx_flow_spec The ring_cookie is 64 bits wide which is much larger than can be used for actual queue index values. So provide some helper routines to pack a VF index into the cookie. This is useful to steer packets to a VF ring without having to know the queue layout of the device. CC: Alex Duyck Signed-off-by: John Fastabend Signed-off-by: Jeff Kirsher --- include/uapi/linux/ethtool.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index ae832b45b44c..0594933cdf55 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -796,6 +796,31 @@ struct ethtool_rx_flow_spec { __u32 location; }; +/* How rings are layed out when accessing virtual functions or + * offloaded queues is device specific. To allow users to do flow + * steering and specify these queues the ring cookie is partitioned + * into a 32bit queue index with an 8 bit virtual function id. + * This also leaves the 3bytes for further specifiers. It is possible + * future devices may support more than 256 virtual functions if + * devices start supporting PCIe w/ARI. However at the moment I + * do not know of any devices that support this so I do not reserve + * space for this at this time. If a future patch consumes the next + * byte it should be aware of this possiblity. + */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +}; + +static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +}; + /** * struct ethtool_rxnfc - command to get or set RX flow classification rules * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, -- cgit v1.2.3 From 37e82c2f974b72c9ab49c787ef7b5bb1aec12768 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 May 2015 15:30:39 -0700 Subject: bpf: allow BPF programs access skb->skb_iif and skb->dev->ifindex fields classic BPF already exposes skb->dev->ifindex via SKF_AD_IFINDEX extension. Allow eBPF program to access it as well. Note that classic aborts execution of the program if 'skb->dev == NULL' (which is inconvenient for program writers), whereas eBPF returns zero in such case. Also expose the 'skb_iif' field, since programs triggered by redirected packet need to known the original interface index. Summary: __skb->ifindex -> skb->dev->ifindex __skb->ingress_ifindex -> skb->skb_iif Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0a9af8b4dae..72f3080afa1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -236,6 +236,8 @@ struct __sk_buff { __u32 vlan_tci; __u32 vlan_proto; __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From a28c257c9eb0bd76a4adcac97c07e34044ec71fb Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 29 May 2015 17:28:07 -0400 Subject: net/rds: Declare SO_RDS_TRANSPORT and RDS_TRANS_* constants in uapi/linux/rds.h User space applications that desire to explicitly select the underlying transport for a PF_RDS socket may do so by using the SO_RDS_TRANSPORT socket option at the SOL_RDS level before bind(). The integer argument provided to the socket option would be one of the RDS_TRANS_* values, e.g., RDS_TRANS_TCP. This commit exports the constant values need by such applications via Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 91950950aa59..0f9265cb2a96 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -38,6 +38,8 @@ #define RDS_IB_ABI_VERSION 0x301 +#define SOL_RDS 276 + /* * setsockopt/getsockopt for SOL_RDS */ @@ -48,6 +50,14 @@ #define RDS_RECVERR 5 #define RDS_CONG_MONITOR 6 #define RDS_GET_MR_FOR_DEST 7 +#define SO_RDS_TRANSPORT 8 + +/* supported values for SO_RDS_TRANSPORT */ +#define RDS_TRANS_IB 0 +#define RDS_TRANS_IWARP 1 +#define RDS_TRANS_TCP 2 +#define RDS_TRANS_COUNT 3 +#define RDS_TRANS_NONE (~0) /* * Control message types for SOL_RDS. -- cgit v1.2.3 From ccea74457bbdafe33dce8bffcb5cb183aeb5f2bb Mon Sep 17 00:00:00 2001 From: Neil McKee Date: Tue, 26 May 2015 20:59:43 -0700 Subject: openvswitch: include datapath actions with sampled-packet upcall to userspace If new optional attribute OVS_USERSPACE_ATTR_ACTIONS is added to an OVS_ACTION_ATTR_USERSPACE action, then include the datapath actions in the upcall. This Directly associates the sampled packet with the path it takes through the virtual switch. Path information currently includes mangling, encapsulation and decapsulation actions for tunneling protocols GRE, VXLAN, Geneve, MPLS and QinQ, but this extension requires no further changes to accommodate datapath actions that may be added in the future. Adding path information enhances visibility into complex virtual networks. Signed-off-by: Neil McKee Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index bbd49a0c46c7..1dab77601c21 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -153,6 +153,8 @@ enum ovs_packet_cmd { * flow key against the kernel's. * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes. + * Also used in upcall when %OVS_ACTION_ATTR_USERSPACE has optional + * %OVS_USERSPACE_ATTR_ACTIONS attribute. * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content @@ -528,6 +530,7 @@ enum ovs_sample_attr { * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA. * @OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: If present, u32 output port to get * tunnel info. + * @OVS_USERSPACE_ATTR_ACTIONS: If present, send actions with upcall. */ enum ovs_userspace_attr { OVS_USERSPACE_ATTR_UNSPEC, @@ -535,6 +538,7 @@ enum ovs_userspace_attr { OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ OVS_USERSPACE_ATTR_EGRESS_TUN_PORT, /* Optional, u32 output port * to get tunnel info. */ + OVS_USERSPACE_ATTR_ACTIONS, /* Optional flag to get actions. */ __OVS_USERSPACE_ATTR_MAX }; -- cgit v1.2.3 From 8760ce58353c2099be35ead62a572ee2d1e83b5b Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 1 Jun 2015 15:51:34 -0400 Subject: geneve: allow user to specify TTL for tunnel frames Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index afccc9393fef..a834f31db915 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -395,6 +395,7 @@ enum { IFLA_GENEVE_UNSPEC, IFLA_GENEVE_ID, IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From d89511251f6519599b109dc6cda87a6ab314ed8c Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 1 Jun 2015 15:51:35 -0400 Subject: geneve: allow user to specify TOS info for tunnel frames Signed-off-by: John W. Linville Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a834f31db915..1737b7a8272b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -396,6 +396,7 @@ enum { IFLA_GENEVE_ID, IFLA_GENEVE_REMOTE, IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 3896d655f4d491c67d669a15f275a39f713410f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2015 16:03:14 -0700 Subject: bpf: introduce bpf_clone_redirect() helper Allow eBPF programs attached to classifier/actions to call bpf_clone_redirect(skb, ifindex, flags) helper which will mirror or redirect the packet by dynamic ifindex selection from within the program to a target device either at ingress or at egress. Can be used for various scenarios, for example, to load balance skbs into veths, split parts of the traffic to local taps, etc. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 72f3080afa1e..42aa19abab86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,16 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_tail_call, + + /** + * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_clone_redirect, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 730fc4371333636a00fed32c587fc1e85c5367e2 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:37 -0700 Subject: mpls: Add definition for IPPROTO_MPLS Add uapi define for MPLS over IP. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 589ced069e8a..641338bef651 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -69,6 +69,8 @@ enum { #define IPPROTO_SCTP IPPROTO_SCTP IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ #define IPPROTO_UDPLITE IPPROTO_UDPLITE + IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */ +#define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_MAX -- cgit v1.2.3 From 90c337da1524863838658078ec34241f45d8394d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Jun 2015 21:17:57 -0700 Subject: inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations When an application needs to force a source IP on an active TCP socket it has to use bind(IP, port=x). As most applications do not want to deal with already used ports, x is often set to 0, meaning the kernel is in charge to find an available port. But kernel does not know yet if this socket is going to be a listener or be connected. It has very limited choices (no full knowledge of final 4-tuple for a connect()) With limited ephemeral port range (about 32K ports), it is very easy to fill the space. This patch adds a new SOL_IP socket option, asking kernel to ignore the 0 port provided by application in bind(IP, port=0) and only remember the given IP address. The port will be automatically chosen at connect() time, in a way that allows sharing a source port as long as the 4-tuples are unique. This new feature is available for both IPv4 and IPv6 (Thanks Neal) Tested: Wrote a test program and checked its behavior on IPv4 and IPv6. strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by connect(). Also getsockname() show that the port is still 0 right after bind() but properly allocated after connect(). socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5 setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0 getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0 IPv6 test : socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7 setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0 bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0 getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0 I was able to bind()/connect() a million concurrent IPv4 sockets, instead of ~32000 before patch. lpaa23:~# ulimit -n 1000010 lpaa23:~# ./bind --connect --num-flows=1000000 & 1000000 sockets lpaa23:~# grep TCP /proc/net/sockstat TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66 Check that a given source port is indeed used by many different connections : lpaa23:~# ss -t src :40000 | head -10 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983 ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983 ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983 ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983 ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983 ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983 ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983 ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983 ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 641338bef651..83d6236a2f08 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -112,6 +112,7 @@ struct in_addr { #define IP_MINTTL 21 #define IP_NODEFRAG 22 #define IP_CHECKSUM 23 +#define IP_BIND_ADDRESS_NO_PORT 24 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ -- cgit v1.2.3 From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:54 -0700 Subject: bpf: allow programs to write to certain skb fields allow programs read/write skb->mark, tc_index fields and ((struct qdisc_skb_cb *)cb)->data. mark and tc_index are generically useful in TC. cb[0]-cb[4] are primarily used to pass arguments from one program to another called via bpf_tail_call() which can be seen in sockex3_kern.c example. All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs. mark, tc_index are writeable from tc_cls_act only. cb[0]-cb[4] are writeable by both sockets and tc_cls_act. Add verifier tests and improve sample code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42aa19abab86..602f05b7a275 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -248,6 +248,8 @@ struct __sk_buff { __u32 priority; __u32 ingress_ifindex; __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 9e58095f9660f88d6a2febe87d5073a6b2e9c399 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 14 Oct 2014 02:19:46 +0200 Subject: NFC: netlink: Implement vendor command support Vendor commands are passed from userspace through the NFC_CMD_VENDOR netlink command, allowing driver and hardware specific operations implementations like for example RF tuning or production line calibration. Drivers will associate a set of vendor commands to a vendor id, which could typically be an OUI. The netlink kernel implementation will try to match the received vendor id and sub command attributes with the registered ones. When such match is found, the driver defined sub command routine is called. Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index c1e2e63cf9b5..dd3f75389076 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -86,6 +86,8 @@ * for this event is the application ID (AID). * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller. * @NFC_CMD_SE_IO: Send/Receive APDUs to/from the selected secure element. + * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly + * from the driver in order to support hardware specific operations. */ enum nfc_commands { NFC_CMD_UNSPEC, @@ -117,6 +119,7 @@ enum nfc_commands { NFC_CMD_GET_SE, NFC_CMD_SE_IO, NFC_CMD_ACTIVATE_TARGET, + NFC_CMD_VENDOR, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; @@ -153,6 +156,10 @@ enum nfc_commands { * @NFC_ATTR_APDU: Secure element APDU * @NFC_ATTR_TARGET_ISO15693_DSFID: ISO 15693 Data Storage Format Identifier * @NFC_ATTR_TARGET_ISO15693_UID: ISO 15693 Unique Identifier + * @NFC_ATTR_VENDOR_ID: NFC manufacturer unique ID, typically an OUI + * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command + * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed + * to a vendor specific command implementation */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -184,6 +191,9 @@ enum nfc_attrs { NFC_ATTR_TARGET_ISO15693_DSFID, NFC_ATTR_TARGET_ISO15693_UID, NFC_ATTR_SE_PARAMS, + NFC_ATTR_VENDOR_ID, + NFC_ATTR_VENDOR_SUBCMD, + NFC_ATTR_VENDOR_DATA, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; -- cgit v1.2.3 From dd895d7f21b244e7fd4c7477697e274de7e44ecb Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 9 Jun 2015 08:05:10 +0200 Subject: can: cangw: introduce optional uid to reference created routing jobs Similar to referencing iptables rules by their line number this UID allows to reference created routing jobs, e.g. to alter configured data modifications. The UID is an optional non-zero value which can be provided at routing job creation time. When the UID is set the UID replaces the data modification configuration as job identification attribute e.g. at job removal time. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/gw.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index 3e6184cf2f6d..5079b9d57e31 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -78,6 +78,7 @@ enum { CGW_FILTER, /* specify struct can_filter on source CAN device */ CGW_DELETED, /* number of deleted CAN frames (see max_hops param) */ CGW_LIM_HOPS, /* limit the number of hops of this specific rule */ + CGW_MOD_UID, /* user defined identifier for modification updates */ __CGW_MAX }; @@ -162,6 +163,10 @@ enum { * load time of the can-gw module). This value is used to reduce the number of * possible hops for this gateway rule to a value smaller then max_hops. * + * CGW_MOD_UID (length 4 bytes): + * Optional non-zero user defined routing job identifier to alter existing + * modification settings at runtime. + * * CGW_CS_XOR (length 4 bytes): * Set a simple XOR checksum starting with an initial value into * data[result-idx] using data[start-idx] .. data[end-idx] -- cgit v1.2.3 From a4244b0cf58d56c171874e85228ba5deffeb017a Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 11 Jun 2015 10:28:16 +0300 Subject: net/ethtool: Add current supported tunable options Add strings array of the current supported tunable options. Signed-off-by: Hadar Hen Zion Reviewed-by: Amir Vadai Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 0594933cdf55..cd67aec187d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -215,6 +215,11 @@ enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, + /* + * Add your fresh new tubale attribute above and remember to update + * tunable_strings[] in net/core/ethtool.c + */ + __ETHTOOL_TUNABLE_COUNT, }; enum tunable_type_id { @@ -545,6 +550,7 @@ enum ethtool_stringset { ETH_SS_NTUPLE_FILTERS, ETH_SS_FEATURES, ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, }; /** -- cgit v1.2.3 From 9961127d4bce6325e9a0b0fb105e0c85a6c62cb7 Mon Sep 17 00:00:00 2001 From: Vincent Cuissard Date: Thu, 11 Jun 2015 11:25:47 +0200 Subject: NFC: nci: add generic uart support Some NFC controller supports UART as host interface. As with SPI, a lot of code can be shared between vendor drivers. This patch add the generic support of UART and provides some extension API for vendor specific needs. This code is strongly inspired by the Bluetooth HCI ldisc implementation. NCI UART vendor drivers will have to register themselves to this layer via nci_uart_register. Underlying tty will have to be configured from user land thanks to an ioctl. Signed-off-by: Vincent Cuissard Signed-off-by: Samuel Ortiz --- include/uapi/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index dac199a2dba5..01c4410352ff 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -34,5 +34,6 @@ #define N_TI_WL 22 /* for TI's WL BT, FM, GPS combo chips */ #define N_TRACESINK 23 /* Trace data routing for MIPI P1149.7 */ #define N_TRACEROUTER 24 /* Trace data routing for MIPI P1149.7 */ +#define N_NCI 25 /* NFC NCI UART */ #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From ca0f6a5cd99e0c6ba4bb78dc402817f636370f26 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 13 Jun 2015 19:45:33 +0200 Subject: netfilter: ipset: Fix coding styles reported by checkpatch.pl Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 5ab4e60894cf..63b2e34f1b60 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -15,12 +15,12 @@ /* The protocol version */ #define IPSET_PROTOCOL 6 -/* The maximum permissible comment length we will accept over netlink */ -#define IPSET_MAX_COMMENT_SIZE 255 - /* The max length of strings including NUL: set and type identifiers */ #define IPSET_MAXNAMELEN 32 +/* The maximum permissible comment length we will accept over netlink */ +#define IPSET_MAX_COMMENT_SIZE 255 + /* Message types and commands */ enum ipset_cmd { IPSET_CMD_NONE, -- cgit v1.2.3 From 2cbce139fc57bc2625f88add055d0b94f00c3352 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 12 Jun 2015 13:55:41 +0200 Subject: netfilter: nf_tables: attach net_device to basechain The device is part of the hook configuration, so instead of a global configuration per table, set it to each of the basechain that we create. This patch reworks ebddf1a8d78a ("netfilter: nf_tables: allow to bind table to net_device"). Note that this adds a dev_name field in the nft_base_chain structure which is required the netdev notification subscription that follows up in a patch to handle gone net_devices. Suggested-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 89a671e0f5e7..a99e6a997140 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -122,11 +122,13 @@ enum nft_list_attributes { * * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) + * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) */ enum nft_hook_attributes { NFTA_HOOK_UNSPEC, NFTA_HOOK_HOOKNUM, NFTA_HOOK_PRIORITY, + NFTA_HOOK_DEV, __NFTA_HOOK_MAX }; #define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) @@ -146,14 +148,12 @@ enum nft_table_flags { * @NFTA_TABLE_NAME: name of the table (NLA_STRING) * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) - * @NFTA_TABLE_DEV: net device name (NLA_STRING) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, NFTA_TABLE_NAME, NFTA_TABLE_FLAGS, NFTA_TABLE_USE, - NFTA_TABLE_DEV, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) -- cgit v1.2.3 From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:12 -0700 Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors eBPF programs attached to kprobes need to filter based on current->pid, uid and other fields, so introduce helper functions: u64 bpf_get_current_pid_tgid(void) Return: current->tgid << 32 | current->pid u64 bpf_get_current_uid_gid(void) Return: current_gid << 32 | current_uid bpf_get_current_comm(char *buf, int size_of_buf) stores current->comm into buf They can be used from the programs attached to TC as well to classify packets based on current task fields. Update tracex2 example to print histogram of write syscalls for each process instead of aggregated for all. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 602f05b7a275..29ef6f99e43d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,25 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_clone_redirect, + + /** + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + */ + BPF_FUNC_get_current_pid_tgid, + + /** + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + */ + BPF_FUNC_get_current_uid_gid, + + /** + * bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success + */ + BPF_FUNC_get_current_comm, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 254cb6dbfd8894743fbf814ec856ccd0874af691 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 14 Jun 2015 16:36:34 +0300 Subject: bonding: export slave's actor_oper_port_state via sysfs and netlink Export the actor_oper_port_state of each port via sysfs and netlink. In 802.3ad mode it is valuable for the user to be able to check the actor_oper state, it is already exported via bond's proc entry. Signed-off-by: Nikolay Aleksandrov Signed-off-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1737b7a8272b..1b3e357223f2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -456,6 +456,7 @@ enum { IFLA_BOND_SLAVE_PERM_HWADDR, IFLA_BOND_SLAVE_QUEUE_ID, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From 46ea297ed67cdeeb0142244873458b911037d0ba Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 14 Jun 2015 16:36:35 +0300 Subject: bonding: export slave's partner_oper_port_state via sysfs and netlink Export the partner_oper_port_state of each port via sysfs and netlink. In 802.3ad mode it is valuable for the user to be able to check the partner_oper state, it is already exported via bond's proc entry. Signed-off-by: Nikolay Aleksandrov Signed-off-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1b3e357223f2..510efb360580 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -457,6 +457,7 @@ enum { IFLA_BOND_SLAVE_QUEUE_ID, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:07 +0300 Subject: net/core: Add reading VF statistics through the PF netdevice Add ndo_get_vf_stats where the PF retrieves and fills the VFs traffic statistics. We encode the VF stats in a nested manner to allow for future extensions. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 510efb360580..2c7e8e3d3981 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -484,6 +484,7 @@ enum { IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query * on/off switch */ + IFLA_VF_STATS, /* network device statistics */ __IFLA_VF_MAX, }; @@ -533,6 +534,18 @@ struct ifla_vf_rss_query_en { __u32 setting; }; +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + /* VF ports management section * * Nested layout of set/get msg is: -- cgit v1.2.3 From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:18 -0400 Subject: sock_diag: define destruction multicast groups These groups will contain socket-destruction events for AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP. Near the end of socket destruction, a check for listeners is performed. In the presence of a listener, rather than completely cleanup the socket, a unit of work will be added to a private work queue which will first broadcast information about the socket and then finish the cleanup operation. Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/sock_diag.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index b00e29efb161..49230d36f9ce 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -23,4 +23,14 @@ enum { SK_MEMINFO_VARS, }; +enum sknetlink_groups { + SKNLGRP_NONE, + SKNLGRP_INET_TCP_DESTROY, + SKNLGRP_INET_UDP_DESTROY, + SKNLGRP_INET6_TCP_DESTROY, + SKNLGRP_INET6_UDP_DESTROY, + __SKNLGRP_MAX, +}; +#define SKNLGRP_MAX (__SKNLGRP_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ -- cgit v1.2.3 From 35ac838a9b96470f999db04320f53a2033642bfb Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:20 -0400 Subject: sock_diag: implement a get_info handler for inet This get_info handler will simply dispatch to the appropriate existing inet protocol handler. This patch also includes a new netlink attribute (INET_DIAG_PROTOCOL). This attribute is currently only used for multicast messages. Without this attribute, there is no way of knowing the IP protocol used by the socket information being broadcast. This attribute is not necessary in the 'dump' variant of this protocol (though it could easily be added) because dump requests are issued for specific family/protocol pairs. Tested: ss -E (note, the -E option has not yet been merged into the upstream version of ss). Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c7093c75bdd6..b629fc53b109 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -111,9 +111,10 @@ enum { INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, INET_DIAG_DCTCPINFO, + INET_DIAG_PROTOCOL, /* response attribute only */ }; -#define INET_DIAG_MAX INET_DIAG_DCTCPINFO +#define INET_DIAG_MAX INET_DIAG_PROTOCOL /* INET_DIAG_MEM */ -- cgit v1.2.3 From ef493bd930ae482c608c5999b40d79dda3f2a674 Mon Sep 17 00:00:00 2001 From: Roman Kubiak Date: Fri, 12 Jun 2015 12:32:57 +0200 Subject: netfilter: nfnetlink_queue: add security context information This patch adds an additional attribute when sending packet information via netlink in netfilter_queue module. It will send additional security context data, so that userspace applications can verify this context against their own security databases. Signed-off-by: Roman Kubiak Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index 8dd819e2b5fe..b67a853638ff 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -49,6 +49,7 @@ enum nfqnl_attr_type { NFQA_EXP, /* nf_conntrack_netlink.h */ NFQA_UID, /* __u32 sk uid */ NFQA_GID, /* __u32 sk gid */ + NFQA_SECCTX, /* security context string */ __NFQA_MAX }; @@ -102,7 +103,8 @@ enum nfqnl_attr_config { #define NFQA_CFG_F_CONNTRACK (1 << 1) #define NFQA_CFG_F_GSO (1 << 2) #define NFQA_CFG_F_UID_GID (1 << 3) -#define NFQA_CFG_F_MAX (1 << 4) +#define NFQA_CFG_F_SECCTX (1 << 4) +#define NFQA_CFG_F_MAX (1 << 5) /* flags for NFQA_SKB_INFO */ /* packet appears to have wrong checksums, but they are ok */ -- cgit v1.2.3 From 01555e74bde51444c6898ef1800fb2bc697d479e Mon Sep 17 00:00:00 2001 From: Harout Hedeshian Date: Mon, 15 Jun 2015 18:40:43 -0600 Subject: netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag xt_socket is useful for matching sockets with IP_TRANSPARENT and taking some action on the matching packets. However, it lacks the ability to match only a small subset of transparent sockets. Suppose there are 2 applications, each with its own set of transparent sockets. The first application wants all matching packets dropped, while the second application wants them forwarded somewhere else. Add the ability to retore the skb->mark from the sk_mark. The mark is only restored if a matching socket is found and the transparent / nowildcard conditions are satisfied. Now the 2 hypothetical applications can differentiate their sockets based on a mark value set with SO_MARK. iptables -t mangle -I PREROUTING -m socket --transparent \ --restore-skmark -j action iptables -t mangle -A action -m mark --mark 10 -j action2 iptables -t mangle -A action -m mark --mark 11 -j action3 Signed-off-by: Harout Hedeshian Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_socket.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h index 6315e2ac3474..87644f832494 100644 --- a/include/uapi/linux/netfilter/xt_socket.h +++ b/include/uapi/linux/netfilter/xt_socket.h @@ -6,6 +6,7 @@ enum { XT_SOCKET_TRANSPARENT = 1 << 0, XT_SOCKET_NOWILDCARD = 1 << 1, + XT_SOCKET_RESTORESKMARK = 1 << 2, }; struct xt_socket_mtinfo1 { @@ -18,4 +19,11 @@ struct xt_socket_mtinfo2 { }; #define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD) +struct xt_socket_mtinfo3 { + __u8 flags; +}; +#define XT_SOCKET_FLAGS_V3 (XT_SOCKET_TRANSPARENT \ + | XT_SOCKET_NOWILDCARD \ + | XT_SOCKET_RESTORESKMARK) + #endif /* _XT_SOCKET_H */ -- cgit v1.2.3 From a263653ed798216c0069922d7b5237ca49436007 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Jun 2015 10:28:27 -0500 Subject: netfilter: don't pull include/linux/netfilter.h from netns headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This pulls the full hook netfilter definitions from all those that include net_namespace.h. Instead let's just include the bare minimum required in the new linux/netfilter_defs.h file, and use it from the netfilter netns header files. I also needed to include in.h and in6.h from linux/netfilter.h otherwise we hit this compilation error: In file included from include/linux/netfilter_defs.h:4:0, from include/net/netns/netfilter.h:4, from include/net/net_namespace.h:22, from include/linux/netdevice.h:43, from net/netfilter/nfnetlink_queue_core.c:23: include/uapi/linux/netfilter.h:76:17: error: field ‘in’ has incomplete type struct in_addr in; And also explicit include linux/netfilter.h in several spots. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Eric W. Biederman --- include/uapi/linux/netfilter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 177027cce6b3..d93f949d1d9a 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -4,7 +4,8 @@ #include #include #include - +#include +#include /* Responses from hook functions. */ #define NF_DROP 0 -- cgit v1.2.3 From b42be38b2778eda2237fc759e55e3b698b05b315 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Wed, 17 Jun 2015 17:14:33 +0200 Subject: netlink: add API to retrieve all group memberships This patch adds getsockopt(SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS) to retrieve all groups a socket is a member of. Currently, we have to use getsockname() and look at the nl.nl_groups bitmask. However, this mask is limited to 32 groups. Hence, similar to NETLINK_ADD_MEMBERSHIP and NETLINK_DROP_MEMBERSHIP, this adds a separate sockopt to manager higher groups IDs than 32. This new NETLINK_LIST_MEMBERSHIPS option takes a pointer to __u32 and the size of the array. The array is filled with the full membership-set of the socket, and the required array size is returned in optlen. Hence, user-space can retry with a properly sized array in case it was too small. Signed-off-by: David Herrmann Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 3e34b7d702f8..cf6a65cccbdf 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -101,14 +101,15 @@ struct nlmsgerr { struct nlmsghdr msg; }; -#define NETLINK_ADD_MEMBERSHIP 1 -#define NETLINK_DROP_MEMBERSHIP 2 -#define NETLINK_PKTINFO 3 -#define NETLINK_BROADCAST_ERROR 4 -#define NETLINK_NO_ENOBUFS 5 -#define NETLINK_RX_RING 6 -#define NETLINK_TX_RING 7 -#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 struct nl_pktinfo { __u32 group; -- cgit v1.2.3 From 42bcce87d763b4d22dc6d3a0c0b60c6b49820de8 Mon Sep 17 00:00:00 2001 From: Anish Bhatt Date: Mon, 22 Jun 2015 17:44:35 -0700 Subject: dcb : Fix incorrect documentation for struct dcb_app While IEEE and CEE use the same structure to store apps, the selector and priority fields for both are different. Only the priority field is explained, add documentation explaining how the selector field differs for both. cgdcbxd code shows an example of how selector fields differ. Signed-off-by: Anish Bhatt Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 6497d7933d5b..3ea470f35e40 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -207,8 +207,7 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_ANY 4 /* This structure contains the IEEE 802.1Qaz APP managed object. This - * object is also used for the CEE std as well. There is no difference - * between the objects. + * object is also used for the CEE std as well. * * @selector: protocol identifier type * @protocol: protocol of type indicated @@ -216,13 +215,18 @@ struct cee_pfc { * 8-bit 802.1p user priority bitmap for CEE * * ---- - * Selector field values + * Selector field values for IEEE 802.1Qaz * 0 Reserved * 1 Ethertype * 2 Well known port number over TCP or SCTP * 3 Well known port number over UDP or DCCP * 4 Well known port number over TCP, SCTP, UDP, or DCCP * 5-7 Reserved + * + * Selector field values for CEE + * 0 Ethertype + * 1 Well known port number over TCP or UDP + * 2-3 Reserved */ struct dcb_app { __u8 selector; -- cgit v1.2.3 From 8a3d03166f19329b46c6f9e900f93a89f446077b Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:36 -0400 Subject: net: track link-status of ipv4 nexthops Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are reachable via an interface where carrier is off. No action is taken, but additional flags are passed to userspace to indicate carrier status. This also includes a cleanup to fib_disable_ip to more clearly indicate what event made the function call to replace the more cryptic force option previously used. v2: Split out kernel functionality into 2 patches, this patch simply sets and clears new nexthop flag RTNH_F_LINKDOWN. v3: Cleanups suggested by Alex as well as a bug noticed in fib_sync_down_dev and fib_sync_up when multipath was not enabled. v5: Whitespace and variable declaration fixups suggested by Dave. v6: Style fixups noticed by Dave; ran checkpatch to be sure I got them all. Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 17fb02f488da..fdd8f07f1d34 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -338,6 +338,9 @@ struct rtnexthop { #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ #define RTNH_F_OFFLOAD 8 /* offloaded route */ +#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN) /* Macros to handle hexthops */ -- cgit v1.2.3 From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Tue, 23 Jun 2015 13:45:37 -0400 Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down This feature is only enabled with the new per-interface or ipv4 global sysctls called 'ignore_routes_with_linkdown'. net.ipv4.conf.all.ignore_routes_with_linkdown = 0 net.ipv4.conf.default.ignore_routes_with_linkdown = 0 net.ipv4.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, will report to userspace that a route is dead and will no longer resolve to this nexthop when performing a fib lookup. This will signal to userspace that the route will not be selected. The signalling of a RTNH_F_DEAD is only passed to userspace if the sysctl is enabled and link is down. This was done as without it the netlink listeners would have no idea whether or not a nexthop would be selected. The kernel only sets RTNH_F_DEAD internally if the interface has IFF_UP cleared. With the new sysctl set, the following behavior can be observed (interface p8p1 is link-down): default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 dead linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 dead linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 90.0.0.1 via 70.0.0.2 dev p7p1 src 70.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 via 10.0.5.2 dev p9p1 src 10.0.5.15 cache While the route does remain in the table (so it can be modified if needed rather than being wiped away as it would be if IFF_UP was cleared), the proper next-hop is chosen automatically when the link is down. Now interface p8p1 is linked-up: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 192.168.56.0/24 dev p2p1 proto kernel scope link src 192.168.56.2 90.0.0.1 via 80.0.0.2 dev p8p1 src 80.0.0.1 cache local 80.0.0.1 dev lo src 80.0.0.1 cache 80.0.0.2 dev p8p1 src 80.0.0.1 cache and the output changes to what one would expect. If the sysctl is not set, the following output would be expected when p8p1 is down: default via 10.0.5.2 dev p9p1 10.0.5.0/24 dev p9p1 proto kernel scope link src 10.0.5.15 70.0.0.0/24 dev p7p1 proto kernel scope link src 70.0.0.1 80.0.0.0/24 dev p8p1 proto kernel scope link src 80.0.0.1 linkdown 90.0.0.0/24 via 80.0.0.2 dev p8p1 metric 1 linkdown 90.0.0.0/24 via 70.0.0.2 dev p7p1 metric 2 Since the dead flag does not appear, there should be no expectation that the kernel would skip using this route due to link being down. v2: Split kernel changes into 2 patches, this actually makes a behavioral change if the sysctl is set. Also took suggestion from Alex to simplify code by only checking sysctl during fib lookup and suggestion from Scott to add a per-interface sysctl. v3: Code clean-ups to make it more readable and efficient as well as a reverse path check fix. v4: Drop binary sysctl v5: Whitespace fixups from Dave v6: Style changes from Dave and checkpatch suggestions v7: One more checkpatch fixup Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Acked-by: Scott Feldman Signed-off-by: David S. Miller --- include/uapi/linux/ip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 411959405ab6..08f894d2ddbd 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -164,6 +164,7 @@ enum IPV4_DEVCONF_ROUTE_LOCALNET, IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, + IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, __IPV4_DEVCONF_MAX }; -- cgit v1.2.3 From 204621551b2a0060a013b92f7add4d5c452fa7cb Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 24 Jun 2015 11:02:51 +0200 Subject: net: inet_diag: export IPV6_V6ONLY sockopt For AF_INET6 sockets, the value of struct ipv6_pinfo.ipv6only is exported to userspace. It indicates whether a socket bound to in6addr_any listens on IPv4 as well as IPv6. Since the socket is natively IPv6, it is not listed by e.g. 'ss -l -4'. This patch is accompanied by an appropriate one for iproute2 to enable the additional information in 'ss -e'. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index b629fc53b109..68a1f71fde9f 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -112,9 +112,10 @@ enum { INET_DIAG_SHUTDOWN, INET_DIAG_DCTCPINFO, INET_DIAG_PROTOCOL, /* response attribute only */ + INET_DIAG_SKV6ONLY, }; -#define INET_DIAG_MAX INET_DIAG_PROTOCOL +#define INET_DIAG_MAX INET_DIAG_SKV6ONLY /* INET_DIAG_MEM */ -- cgit v1.2.3