From 4749c3ef854e3a5d3dd3cc0ccd2dcb7e05d583bd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 30 Apr 2015 12:12:00 +0200
Subject: net: sched: remove TC_MUNGED bits

Not used.

pedit sets TC_MUNGED when packet content was altered, but all the core
does is unset MUNGED again and then set OK2MUNGE.

And the latter isn't tested anywhere. So lets remove both
TC_MUNGED and TC_OK2MUNGE.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index bf08e76bf505..6810ca43a80a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -35,6 +35,8 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
  *
  * */
 
+#ifndef __KERNEL__
+/* backwards compat for userspace only */
 #define TC_MUNGED          _TC_MAKEMASK1(0)
 #define SET_TC_MUNGED(v)   ( TC_MUNGED | (v & ~TC_MUNGED))
 #define CLR_TC_MUNGED(v)   ( v & ~TC_MUNGED)
@@ -42,6 +44,7 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define TC_OK2MUNGE        _TC_MAKEMASK1(1)
 #define SET_TC_OK2MUNGE(v)   ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE))
 #define CLR_TC_OK2MUNGE(v)   ( v & ~TC_OK2MUNGE)
+#endif
 
 #define S_TC_VERD          _TC_MAKE32(2)
 #define M_TC_VERD          _TC_MAKEMASK(4,S_TC_VERD)
-- 
cgit v1.2.3


From c19ae86a510cf4332af64caab04718bc853d3184 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Fri, 1 May 2015 22:19:43 -0700
Subject: tc: remove unused redirect ttl

improves ingress+u32 performance from 22.4 Mpps to 22.9 Mpps

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 6810ca43a80a..596ffa0c7084 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -65,11 +65,13 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define SET_TC_NCLS(v)   ( TC_NCLS | (v & ~TC_NCLS))
 #define CLR_TC_NCLS(v)   ( v & ~TC_NCLS)
 
+#ifndef __KERNEL__
 #define S_TC_RTTL          _TC_MAKE32(9)
 #define M_TC_RTTL          _TC_MAKEMASK(3,S_TC_RTTL)
 #define G_TC_RTTL(x)       _TC_GETVALUE(x,S_TC_RTTL,M_TC_RTTL)
 #define V_TC_RTTL(x)       _TC_MAKEVALUE(x,S_TC_RTTL)
 #define SET_TC_RTTL(v,n)   ((V_TC_RTTL(n)) | (v & ~M_TC_RTTL))
+#endif
 
 #define S_TC_AT          _TC_MAKE32(12)
 #define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
-- 
cgit v1.2.3


From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 May 2015 21:34:46 -0700
Subject: tcp: provide SYN headers for passive connections

This patch allows a server application to get the TCP SYN headers for
its passive connections.  This is useful if the server is doing
fingerprinting of clients based on SYN packet contents.

Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN.

The first is used on a socket to enable saving the SYN headers
for child connections. This can be set before or after the listen()
call.

The latter is used to retrieve the SYN headers for passive connections,
if the parent listener has enabled TCP_SAVE_SYN.

TCP_SAVED_SYN is read once, it frees the saved SYN headers.

The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP
headers.

Original patch was written by Tom Herbert, I changed it to not hold
a full skb (and associated dst and conntracking reference).

We have used such patch for about 3 years at Google.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index faa72f4fa547..51ebedba577f 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -113,6 +113,8 @@ enum {
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
+#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
+#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
-- 
cgit v1.2.3


From a2f11835994ed5bcd6d66c7205947cc482231b08 Mon Sep 17 00:00:00 2001
From: Shawn Landden <shawn@churchofgit.com>
Date: Tue, 5 May 2015 09:07:16 -0700
Subject: can.h: make padding given by gcc explicit

The current definition of struct can_frame has a 16-byte size, with 8-byte
alignment, but the 3 bytes of padding are not explicit like the similar 2 bytes
of padding of struct canfd_frame. Make it explicit so it is easier to read.

Signed-off-by: Shawn Landden <shawn@churchofgit.com>
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h
index 41892f720057..9692cda5f8fc 100644
--- a/include/uapi/linux/can.h
+++ b/include/uapi/linux/can.h
@@ -95,11 +95,17 @@ typedef __u32 can_err_mask_t;
  * @can_dlc: frame payload length in byte (0 .. 8) aka data length code
  *           N.B. the DLC field from ISO 11898-1 Chapter 8.4.2.3 has a 1:1
  *           mapping of the 'data length code' to the real payload length
+ * @__pad:   padding
+ * @__res0:  reserved / padding
+ * @__res1:  reserved / padding
  * @data:    CAN frame payload (up to 8 byte)
  */
 struct can_frame {
 	canid_t can_id;  /* 32 bit CAN_ID + EFF/RTR/ERR flags */
 	__u8    can_dlc; /* frame payload length in byte (0 .. CAN_MAX_DLEN) */
+	__u8    __pad;   /* padding */
+	__u8    __res0;  /* reserved / padding */
+	__u8    __res1;  /* reserved / padding */
 	__u8    data[CAN_MAX_DLEN] __attribute__((aligned(8)));
 };
 
-- 
cgit v1.2.3


From 06f207fc541862ba8902ceda0ddeade6ea6bce72 Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Wed, 6 May 2015 16:28:31 +0300
Subject: cfg80211: change GO_CONCURRENT to IR_CONCURRENT for STA

The GO_CONCURRENT regulatory definition can be extended to station
interfaces requesting to IR as part of TDLS off-channel operations.
Rename the GO_CONCURRENT flag to IR_CONCURRENT and allow the added
use-case.

Change internal users of GO_CONCURRENT to use the new definition.

Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 241220c43e86..c0ab6b0a3919 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2620,16 +2620,17 @@ enum nl80211_band_attr {
  *	an indoor surroundings, i.e., it is connected to AC power (and not
  *	through portable DC inverters) or is under the control of a master
  *	that is acting as an AP and is connected to AC power.
- * @NL80211_FREQUENCY_ATTR_GO_CONCURRENT: GO operation is allowed on this
+ * @NL80211_FREQUENCY_ATTR_IR_CONCURRENT: IR operation is allowed on this
  *	channel if it's connected concurrently to a BSS on the same channel on
  *	the 2 GHz band or to a channel in the same UNII band (on the 5 GHz
- *	band), and IEEE80211_CHAN_RADAR is not set. Instantiating a GO on a
- *	channel that has the GO_CONCURRENT attribute set can be done when there
- *	is a clear assessment that the device is operating under the guidance of
- *	an authorized master, i.e., setting up a GO while the device is also
- *	connected to an AP with DFS and radar detection on the UNII band (it is
- *	up to user-space, i.e., wpa_supplicant to perform the required
- *	verifications)
+ *	band), and IEEE80211_CHAN_RADAR is not set. Instantiating a GO or TDLS
+ *	off-channel on a channel that has the IR_CONCURRENT attribute set can be
+ *	done when there is a clear assessment that the device is operating under
+ *	the guidance of an authorized master, i.e., setting up a GO or TDLS
+ *	off-channel while the device is also connected to an AP with DFS and
+ *	radar detection on the UNII band (it is up to user-space, i.e.,
+ *	wpa_supplicant to perform the required verifications). Using this
+ *	attribute for IR is disallowed for master interfaces (IBSS, AP).
  * @NL80211_FREQUENCY_ATTR_NO_20MHZ: 20 MHz operation is not allowed
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed
@@ -2641,7 +2642,7 @@ enum nl80211_band_attr {
  * See https://apps.fcc.gov/eas/comments/GetPublishedDocument.html?id=327&tn=528122
  * for more information on the FCC description of the relaxations allowed
  * by NL80211_FREQUENCY_ATTR_INDOOR_ONLY and
- * NL80211_FREQUENCY_ATTR_GO_CONCURRENT.
+ * NL80211_FREQUENCY_ATTR_IR_CONCURRENT.
  */
 enum nl80211_frequency_attr {
 	__NL80211_FREQUENCY_ATTR_INVALID,
@@ -2659,7 +2660,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_NO_160MHZ,
 	NL80211_FREQUENCY_ATTR_DFS_CAC_TIME,
 	NL80211_FREQUENCY_ATTR_INDOOR_ONLY,
-	NL80211_FREQUENCY_ATTR_GO_CONCURRENT,
+	NL80211_FREQUENCY_ATTR_IR_CONCURRENT,
 	NL80211_FREQUENCY_ATTR_NO_20MHZ,
 	NL80211_FREQUENCY_ATTR_NO_10MHZ,
 
@@ -2672,6 +2673,8 @@ enum nl80211_frequency_attr {
 #define NL80211_FREQUENCY_ATTR_PASSIVE_SCAN	NL80211_FREQUENCY_ATTR_NO_IR
 #define NL80211_FREQUENCY_ATTR_NO_IBSS		NL80211_FREQUENCY_ATTR_NO_IR
 #define NL80211_FREQUENCY_ATTR_NO_IR		NL80211_FREQUENCY_ATTR_NO_IR
+#define NL80211_FREQUENCY_ATTR_GO_CONCURRENT \
+					NL80211_FREQUENCY_ATTR_IR_CONCURRENT
 
 /**
  * enum nl80211_bitrate_attr - bitrate attributes
@@ -2830,7 +2833,7 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated
  *	base on contiguous rules and wider channels will be allowed to cross
  *	multiple contiguous/overlapping frequency ranges.
- * @NL80211_RRF_GO_CONCURRENT: See &NL80211_FREQUENCY_ATTR_GO_CONCURRENT
+ * @NL80211_RRF_IR_CONCURRENT: See &NL80211_FREQUENCY_ATTR_IR_CONCURRENT
  * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation
  * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation
  * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed
@@ -2847,7 +2850,7 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_IR		= 1<<7,
 	__NL80211_RRF_NO_IBSS		= 1<<8,
 	NL80211_RRF_AUTO_BW		= 1<<11,
-	NL80211_RRF_GO_CONCURRENT	= 1<<12,
+	NL80211_RRF_IR_CONCURRENT	= 1<<12,
 	NL80211_RRF_NO_HT40MINUS	= 1<<13,
 	NL80211_RRF_NO_HT40PLUS		= 1<<14,
 	NL80211_RRF_NO_80MHZ		= 1<<15,
@@ -2859,6 +2862,7 @@ enum nl80211_reg_rule_flags {
 #define NL80211_RRF_NO_IR		NL80211_RRF_NO_IR
 #define NL80211_RRF_NO_HT40		(NL80211_RRF_NO_HT40MINUS |\
 					 NL80211_RRF_NO_HT40PLUS)
+#define NL80211_RRF_GO_CONCURRENT	NL80211_RRF_IR_CONCURRENT
 
 /* For backport compatibility with older userspace */
 #define NL80211_RRF_NO_IR_ALL		(NL80211_RRF_NO_IR | __NL80211_RRF_NO_IBSS)
-- 
cgit v1.2.3


From e520af48c7e5acae5f17f82a79ba7ab7cf156f3b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 May 2015 14:26:25 -0700
Subject: tcp: add TCPWinProbe and TCPKeepAlive SNMP counters

Diagnosing problems related to Window Probes has been hard because
we lack a counter.

TCPWinProbe counts the number of ACK packets a sender has to send
at regular intervals to make sure a reverse ACK packet opening back
a window had not been lost.

TCPKeepAlive counts the number of ACK packets sent to keep TCP
flows alive (SO_KEEPALIVE)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 6a6fb747c78d..eee8968407f0 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -276,6 +276,8 @@ enum
 	LINUX_MIB_TCPACKSKIPPEDFINWAIT2,	/* TCPACKSkippedFinWait2 */
 	LINUX_MIB_TCPACKSKIPPEDTIMEWAIT,	/* TCPACKSkippedTimeWait */
 	LINUX_MIB_TCPACKSKIPPEDCHALLENGE,	/* TCPACKSkippedChallenge */
+	LINUX_MIB_TCPWINPROBE,			/* TCPWinProbe */
+	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
 	__LINUX_MIB_MAX
 };
 
-- 
cgit v1.2.3


From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 7 May 2015 11:02:53 +0200
Subject: netlink: allow to listen "all" netns

More accurately, listen all netns that have a nsid assigned into the netns
where the netlink socket is opened.
For this purpose, a netlink socket option is added:
NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this
socket will receive netlink notifications from all netns that have a nsid
assigned into the netns where the socket has been opened. The nsid is sent
to userland via an anscillary data.

With this patch, a daemon needs only one socket to listen many netns. This
is useful when the number of netns is high.

Because 0 is a valid value for a nsid, the field nsid_is_set indicates if
the field nsid is valid or not. skb->cb is initialized to 0 on skb
allocation, thus we are sure that we will never send a nsid 0 by error to
the userland.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 1a85940f8ab7..3e34b7d702f8 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -108,6 +108,7 @@ struct nlmsgerr {
 #define NETLINK_NO_ENOBUFS	5
 #define NETLINK_RX_RING		6
 #define NETLINK_TX_RING		7
+#define NETLINK_LISTEN_ALL_NSID	8
 
 struct nl_pktinfo {
 	__u32	group;
-- 
cgit v1.2.3


From 80ba92fa1a92dea128283f69f55b02242e213650 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 May 2015 15:05:12 -0700
Subject: codel: add ce_threshold attribute

For DCTCP or similar ECN based deployments on fabrics with shallow
buffers, hosts are responsible for a good part of the buffering.

This patch adds an optional ce_threshold to codel & fq_codel qdiscs,
so that DCTCP can have feedback from queuing in the host.

A DCTCP enabled egress port simply have a queue occupancy threshold
above which ECT packets get CE mark.

In codel language this translates to a sojourn time, so that one doesn't
have to worry about bytes or bandwidth but delays.

This makes the host an active participant in the health of the whole
network.

This also helps experimenting DCTCP in a setup without DCTCP compliant
fabric.

On following example, ce_threshold is set to 1ms, and we can see from
'ldelay xxx us' that TCP is not trying to go around the 5ms codel
target.

Queue has more capacity to absorb inelastic bursts (say from UDP
traffic), as queues are maintained to an optimal level.

lpaa23:~# ./tc -s -d qd sh dev eth1
qdisc mq 1: dev eth1 root
 Sent 87910654696 bytes 58065331 pkt (dropped 0, overlimits 0 requeues 42961)
 backlog 3108242b 364p requeues 42961
qdisc codel 8063: dev eth1 parent 1:1 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7363778701 bytes 4863809 pkt (dropped 0, overlimits 0 requeues 5503)
 rate 2348Mbit 193919pps backlog 255866b 46p requeues 5503
  count 0 lastcount 0 ldelay 1.0ms drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 72384
qdisc codel 8064: dev eth1 parent 1:2 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 7636486190 bytes 5043942 pkt (dropped 0, overlimits 0 requeues 5186)
 rate 2319Mbit 191538pps backlog 207418b 64p requeues 5186
  count 0 lastcount 0 ldelay 694us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 69873
qdisc codel 8065: dev eth1 parent 1:3 limit 1000p target 5.0ms ce_threshold 1.0ms interval 100.0ms
 Sent 11569360142 bytes 7641602 pkt (dropped 0, overlimits 0 requeues 5554)
 rate 3041Mbit 251096pps backlog 210446b 59p requeues 5554
  count 0 lastcount 0 ldelay 889us drop_next 0us
  maxpacket 68130 ecn_mark 0 drop_overlimit 0 ce_mark 37780
...

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Glenn Judd <glenn.judd@morganstanley.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 534b84710745..69d88b309cc7 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -679,6 +679,7 @@ enum {
 	TCA_CODEL_LIMIT,
 	TCA_CODEL_INTERVAL,
 	TCA_CODEL_ECN,
+	TCA_CODEL_CE_THRESHOLD,
 	__TCA_CODEL_MAX
 };
 
@@ -695,6 +696,7 @@ struct tc_codel_xstats {
 	__u32	drop_overlimit; /* number of time max qdisc packet limit was hit */
 	__u32	ecn_mark;  /* number of packets we ECN marked instead of dropped */
 	__u32	dropping;  /* are we in dropping state ? */
+	__u32	ce_mark;   /* number of CE marked packets because of ce_threshold */
 };
 
 /* FQ_CODEL */
@@ -707,6 +709,7 @@ enum {
 	TCA_FQ_CODEL_ECN,
 	TCA_FQ_CODEL_FLOWS,
 	TCA_FQ_CODEL_QUANTUM,
+	TCA_FQ_CODEL_CE_THRESHOLD,
 	__TCA_FQ_CODEL_MAX
 };
 
@@ -730,6 +733,7 @@ struct tc_fq_codel_qd_stats {
 				 */
 	__u32	new_flows_len;	/* count of flows in new list */
 	__u32	old_flows_len;	/* count of flows in old list */
+	__u32	ce_mark;	/* packets above ce_threshold */
 };
 
 struct tc_fq_codel_cl_stats {
-- 
cgit v1.2.3


From 171a42c38c6e1a5a076d6276e94e55a0b5b7868c Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Sat, 9 May 2015 00:01:58 -0700
Subject: bonding: add netlink support for sys prio, actor sys mac, and port
 key

Adds netlink support for the following bonding options:
* BOND_OPT_AD_ACTOR_SYS_PRIO
* BOND_OPT_AD_ACTOR_SYSTEM
* BOND_OPT_AD_USER_PORT_KEY

When setting the actor system mac address we assume the netlink message
contains a binary mac and not a string representation of a mac.

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
[jt: completed the setting side of the netlink attributes]
Signed-off-by: Jonathan Toppins <jtoppins@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d9cd19214b98..6d6e502e1051 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -417,6 +417,9 @@ enum {
 	IFLA_BOND_AD_LACP_RATE,
 	IFLA_BOND_AD_SELECT,
 	IFLA_BOND_AD_INFO,
+	IFLA_BOND_AD_ACTOR_SYS_PRIO,
+	IFLA_BOND_AD_USER_PORT_KEY,
+	IFLA_BOND_AD_ACTOR_SYSTEM,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From a3eb95f891d6130b1fc03dd07a8b54cf0a5c8ab8 Mon Sep 17 00:00:00 2001
From: David Ward <david.ward@ll.mit.edu>
Date: Sat, 9 May 2015 22:01:46 -0400
Subject: net_sched: gred: add TCA_GRED_LIMIT attribute

In a GRED qdisc, if the default "virtual queue" (VQ) does not have drop
parameters configured, then packets for the default VQ are not subjected
to RED and are only dropped if the queue is larger than the net_device's
tx_queue_len. This behavior is useful for WRED mode, since these packets
will still influence the calculated average queue length and (therefore)
the drop probability for all of the other VQs. However, for some drivers
tx_queue_len is zero. In other cases the user may wish to make the limit
the same for all VQs (including the default VQ with no drop parameters).

This change adds a TCA_GRED_LIMIT attribute to set the GRED queue limit,
in bytes, during qdisc setup. (This limit is in bytes to be consistent
with the drop parameters.) The default limit is the same as for a bfifo
queue (tx_queue_len * psched_mtu). If the drop parameters of any VQ are
configured with a smaller limit than the GRED queue limit, that VQ will
still observe the smaller limit instead.

Signed-off-by: David Ward <david.ward@ll.mit.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 69d88b309cc7..8d2530daca9f 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -268,7 +268,8 @@ enum {
        TCA_GRED_STAB,
        TCA_GRED_DPS,
        TCA_GRED_MAX_P,
-	   __TCA_GRED_MAX,
+       TCA_GRED_LIMIT,
+       __TCA_GRED_MAX,
 };
 
 #define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
-- 
cgit v1.2.3


From e578d9c02587d57bfa7b560767c698a668a468c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 11 May 2015 19:50:41 +0200
Subject: net: sched: use counter to break reclassify loops

Seems all we want here is to avoid endless 'goto reclassify' loop.
tc_classify_compat even resets this counter when something other
than TC_ACT_RECLASSIFY is returned, so this skb-counter doesn't
break hypothetical loops induced by something other than perpetual
TC_ACT_RECLASSIFY return values.

skb_act_clone is now identical to skb_clone, so just use that.

Tested with following (bogus) filter:
tc filter add dev eth0 parent ffff: \
 protocol ip u32 match u32 0 0 police rate 10Kbit burst \
 64000 mtu 1500 action reclassify

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 596ffa0c7084..ffc112c8e1c2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -44,13 +44,13 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define TC_OK2MUNGE        _TC_MAKEMASK1(1)
 #define SET_TC_OK2MUNGE(v)   ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE))
 #define CLR_TC_OK2MUNGE(v)   ( v & ~TC_OK2MUNGE)
-#endif
 
 #define S_TC_VERD          _TC_MAKE32(2)
 #define M_TC_VERD          _TC_MAKEMASK(4,S_TC_VERD)
 #define G_TC_VERD(x)       _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD)
 #define V_TC_VERD(x)       _TC_MAKEVALUE(x,S_TC_VERD)
 #define SET_TC_VERD(v,n)   ((V_TC_VERD(n)) | (v & ~M_TC_VERD))
+#endif
 
 #define S_TC_FROM          _TC_MAKE32(6)
 #define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
-- 
cgit v1.2.3


From 77b9900ef53ae047e36a37d13a2aa33bb2d60641 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:21 +0200
Subject: tc: introduce Flower classifier

This patch introduces a flow-based filter. So far, the very essential
packet fields are supported.

This patch is only the first step. There is a lot of potential performance
improvements possible to implement. Also a lot of features are missing
now. They will be addressed in follow-up patches.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index ffc112c8e1c2..39fb53d67b11 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -409,6 +409,36 @@ enum {
 
 #define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
 
+/* Flower classifier */
+
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+	__TCA_FLOWER_MAX,
+};
+
+#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
+
 /* Extended Matches */
 
 struct tcf_ematch_tree_hdr {
-- 
cgit v1.2.3


From a9b6391814d5d6b8668fca2dace86949b7244e2e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 May 2015 11:56:50 -0400
Subject: packet: rollover statistics

Rollover indicates exceptional conditions. Export a counter to inform
socket owners of this state.

If no socket with sufficient room is found, rollover fails. Also count
these events.

Finally, also count when flows are rolled over early thanks to huge
flow detection, to validate its correctness.

Tested:
  Read counters in bench_rollover on all other tests in the patchset

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 053bd102fbe0..d3d715f8c88f 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -54,6 +54,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT			18
 #define PACKET_TX_HAS_OFF		19
 #define PACKET_QDISC_BYPASS		20
+#define PACKET_ROLLOVER_STATS		21
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
@@ -75,6 +76,12 @@ struct tpacket_stats_v3 {
 	unsigned int	tp_freeze_q_cnt;
 };
 
+struct tpacket_rollover_stats {
+	__aligned_u64	tp_all;
+	__aligned_u64	tp_huge;
+	__aligned_u64	tp_failed;
+};
+
 union tpacket_stats_u {
 	struct tpacket_stats stats1;
 	struct tpacket_stats_v3 stats3;
-- 
cgit v1.2.3


From 2d07dc79fe04a43d82a346ced6bbf07bdb523f1b Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Wed, 13 May 2015 12:57:30 -0400
Subject: geneve: add initial netdev driver for GENEVE tunnels

This is an initial implementation of a netdev driver for GENEVE
tunnels.  This implementation uses a fixed UDP port, and only supports
point-to-point links with specific partner endpoints.  Only IPv4
links are supported at this time.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6d6e502e1051..afccc9393fef 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -390,6 +390,15 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+/* GENEVE section */
+enum {
+	IFLA_GENEVE_UNSPEC,
+	IFLA_GENEVE_ID,
+	IFLA_GENEVE_REMOTE,
+	__IFLA_GENEVE_MAX
+};
+#define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
+
 /* Bonding section */
 
 enum {
-- 
cgit v1.2.3


From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:38 +0200
Subject: netfilter: add netfilter ingress hook after handle_ing() under unique
 static key

This patch adds the Netfilter ingress hook just after the existing tc ingress
hook, that seems to be the consensus solution for this.

Note that the Netfilter hook resides under the global static key that enables
ingress filtering. Nonetheless, Netfilter still also has its own static key for
minimal impact on the existing handle_ing().

* Without this patch:

Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags)
  16086246pps 7721Mb/sec (7721398080bps) errors: 100000000

    42.46%  kpktgend_0   [kernel.kallsyms]   [k] __netif_receive_skb_core
    25.92%  kpktgend_0   [kernel.kallsyms]   [k] kfree_skb
     7.81%  kpktgend_0   [pktgen]            [k] pktgen_thread_worker
     5.62%  kpktgend_0   [kernel.kallsyms]   [k] ip_rcv
     2.70%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_internal
     2.34%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_sk
     1.44%  kpktgend_0   [kernel.kallsyms]   [k] __build_skb

* With this patch:

Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags)
  16090536pps 7723Mb/sec (7723457280bps) errors: 100000000

    41.23%  kpktgend_0      [kernel.kallsyms]  [k] __netif_receive_skb_core
    26.57%  kpktgend_0      [kernel.kallsyms]  [k] kfree_skb
     7.72%  kpktgend_0      [pktgen]           [k] pktgen_thread_worker
     5.55%  kpktgend_0      [kernel.kallsyms]  [k] ip_rcv
     2.78%  kpktgend_0      [kernel.kallsyms]  [k] netif_receive_skb_internal
     2.06%  kpktgend_0      [kernel.kallsyms]  [k] netif_receive_skb_sk
     1.43%  kpktgend_0      [kernel.kallsyms]  [k] __build_skb

* Without this patch + tc ingress:

        tc filter add dev eth4 parent ffff: protocol ip prio 1 \
                u32 match ip dst 4.3.2.1/32

Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags)
  10788648pps 5178Mb/sec (5178551040bps) errors: 100000000

    40.99%  kpktgend_0   [kernel.kallsyms]  [k] __netif_receive_skb_core
    17.50%  kpktgend_0   [kernel.kallsyms]  [k] kfree_skb
    11.77%  kpktgend_0   [cls_u32]          [k] u32_classify
     5.62%  kpktgend_0   [kernel.kallsyms]  [k] tc_classify_compat
     5.18%  kpktgend_0   [pktgen]           [k] pktgen_thread_worker
     3.23%  kpktgend_0   [kernel.kallsyms]  [k] tc_classify
     2.97%  kpktgend_0   [kernel.kallsyms]  [k] ip_rcv
     1.83%  kpktgend_0   [kernel.kallsyms]  [k] netif_receive_skb_internal
     1.50%  kpktgend_0   [kernel.kallsyms]  [k] netif_receive_skb_sk
     0.99%  kpktgend_0   [kernel.kallsyms]  [k] __build_skb

* With this patch + tc ingress:

        tc filter add dev eth4 parent ffff: protocol ip prio 1 \
                u32 match ip dst 4.3.2.1/32

Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags)
  10743194pps 5156Mb/sec (5156733120bps) errors: 100000000

    42.01%  kpktgend_0   [kernel.kallsyms]   [k] __netif_receive_skb_core
    17.78%  kpktgend_0   [kernel.kallsyms]   [k] kfree_skb
    11.70%  kpktgend_0   [cls_u32]           [k] u32_classify
     5.46%  kpktgend_0   [kernel.kallsyms]   [k] tc_classify_compat
     5.16%  kpktgend_0   [pktgen]            [k] pktgen_thread_worker
     2.98%  kpktgend_0   [kernel.kallsyms]   [k] ip_rcv
     2.84%  kpktgend_0   [kernel.kallsyms]   [k] tc_classify
     1.96%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_internal
     1.57%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_sk

Note that the results are very similar before and after.

I can see gcc gets the code under the ingress static key out of the hot path.
Then, on that cold branch, it generates the code to accomodate the netfilter
ingress static key. My explanation for this is that this reduces the pressure
on the instruction cache for non-users as the new code is out of the hot path,
and it comes with minimal impact for tc ingress users.

Using gcc version 4.8.4 on:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
[...]
L1d cache:             16K
L1i cache:             64K
L2 cache:              2048K
L3 cache:              8192K

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netfilter.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index ef1b1f88ca18..177027cce6b3 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -51,11 +51,17 @@ enum nf_inet_hooks {
 	NF_INET_NUMHOOKS
 };
 
+enum nf_dev_hooks {
+	NF_NETDEV_INGRESS,
+	NF_NETDEV_NUMHOOKS
+};
+
 enum {
 	NFPROTO_UNSPEC =  0,
 	NFPROTO_INET   =  1,
 	NFPROTO_IPV4   =  2,
 	NFPROTO_ARP    =  3,
+	NFPROTO_NETDEV =  5,
 	NFPROTO_BRIDGE =  7,
 	NFPROTO_IPV6   = 10,
 	NFPROTO_DECNET = 12,
-- 
cgit v1.2.3


From c9a70d43461d83818825ae065bb8fc887421e150 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav.pandit@avagotech.com>
Date: Mon, 18 May 2015 16:31:47 +0530
Subject: net-next: ethtool: Added port speed macros.

Signed-off-by: Parav Pandit <parav.pandit@avagotech.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 2e49fc880d29..ae832b45b44c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1264,15 +1264,19 @@ enum ethtool_sfeatures_retval_bits {
  * it was forced up into this mode or autonegotiated.
  */
 
-/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|10|20|40|56]GbE. */
+/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */
 #define SPEED_10		10
 #define SPEED_100		100
 #define SPEED_1000		1000
 #define SPEED_2500		2500
+#define SPEED_5000		5000
 #define SPEED_10000		10000
 #define SPEED_20000		20000
+#define SPEED_25000		25000
 #define SPEED_40000		40000
+#define SPEED_50000		50000
 #define SPEED_56000		56000
+#define SPEED_100000		100000
 
 #define SPEED_UNKNOWN		-1
 
-- 
cgit v1.2.3


From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 19 May 2015 16:59:03 -0700
Subject: bpf: allow bpf programs to tail-call other bpf programs

introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.

bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table

Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.

New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.

The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.

Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>

==========
- simplify complex programs by splitting them into a sequence of small programs

- dispatch routine
  For tracing and future seccomp the program may be triggered on all system
  calls, but processing of syscall arguments will be different. It's more
  efficient to implement them as:
  int syscall_entry(struct seccomp_data *ctx)
  {
     bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
     ... default: process unknown syscall ...
  }
  int sys_write_event(struct seccomp_data *ctx) {...}
  int sys_read_event(struct seccomp_data *ctx) {...}
  syscall_jmp_table[__NR_write] = sys_write_event;
  syscall_jmp_table[__NR_read] = sys_read_event;

  For networking the program may call into different parsers depending on
  packet format, like:
  int packet_parser(struct __sk_buff *skb)
  {
     ... parse L2, L3 here ...
     __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
     bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
     ... default: process unknown protocol ...
  }
  int parse_tcp(struct __sk_buff *skb) {...}
  int parse_udp(struct __sk_buff *skb) {...}
  ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
  ipproto_jmp_table[IPPROTO_UDP] = parse_udp;

- for TC use case, bpf_tail_call() allows to implement reclassify-like logic

- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
  are atomic, so user space can build chains of BPF programs on the fly

Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
  It could have been implemented without JIT changes as a wrapper on top of
  BPF_PROG_RUN() macro, but with two downsides:
  . all programs would have to pay performance penalty for this feature and
    tail call itself would be slower, since mandatory stack unwind, return,
    stack allocate would be done for every tailcall.
  . tailcall would be limited to programs running preempt_disabled, since
    generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
    need to be either global per_cpu variable accessed by helper and by wrapper
    or global variable protected by locks.

  In this implementation x64 JIT bypasses stack unwind and jumps into the
  callee program after prologue.

- bpf_prog_array_compatible() ensures that prog_type of callee and caller
  are the same and JITed/non-JITed flag is the same, since calling JITed
  program from non-JITed is invalid, since stack frames are different.
  Similarly calling kprobe type program from socket type program is invalid.

- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
  abstraction, its user space API and all of verifier logic.
  It's in the existing arraymap.c file, since several functions are
  shared with regular array map.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9ebdf5701e8..f0a9af8b4dae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
 };
 
 enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_l4_csum_replace,
+
+	/**
+	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
+	 * @ctx: context pointer passed to next program
+	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
+	 * @index: index inside array that selects specific program to run
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_tail_call,
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <mleitner@redhat.com>
Date: Wed, 20 May 2015 16:35:41 -0700
Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info

This patch tracks the total number of inbound and outbound segments on a
TCP socket. One may use this number to have an idea on connection
quality when compared against the retransmissions.

RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut

These are a 32bit field each and can be fetched both from TCP_INFO
getsockopt() if one has a handle on a TCP socket, or from inet_diag
netlink facility (iproute2/ss patch will follow)

Note that tp->segs_out was placed near tp->snd_nxt for good data
locality and minimal performance impact, while tp->segs_in was placed
near tp->bytes_received for the same reason.

Join work with Eric Dumazet.

Note that received SYN are accounted on the listener, but sent SYNACK
are not accounted.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 51ebedba577f..65a77b071e22 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -192,8 +192,10 @@ struct tcp_info {
 
 	__u64	tcpi_pacing_rate;
 	__u64	tcpi_max_pacing_rate;
-	__u64	tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
+	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
 };
 
 /* for TCP_MD5SIG socket option */
-- 
cgit v1.2.3


From bd5850d39f10f9d216bff69bcbf5938680b862ae Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 May 2015 02:26:24 +0200
Subject: net: sched: pkt_cls: remove unused macros from uapi

Jamal points out that this header also contains kernel internal magic that
cannot be used from userspace for anything meaningful.

Lets remove what the kernel doesn't use anymore and wrap remainder with
__KERNEL__.

Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 32 +++++---------------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 39fb53d67b11..4f0d1bc3647d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 #include <linux/pkt_sched.h>
 
+#ifdef __KERNEL__
 /* I think i could have done better macros ; for now this is stolen from
  * some arch/mips code - jhs
 */
@@ -35,23 +36,6 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
  *
  * */
 
-#ifndef __KERNEL__
-/* backwards compat for userspace only */
-#define TC_MUNGED          _TC_MAKEMASK1(0)
-#define SET_TC_MUNGED(v)   ( TC_MUNGED | (v & ~TC_MUNGED))
-#define CLR_TC_MUNGED(v)   ( v & ~TC_MUNGED)
-
-#define TC_OK2MUNGE        _TC_MAKEMASK1(1)
-#define SET_TC_OK2MUNGE(v)   ( TC_OK2MUNGE | (v & ~TC_OK2MUNGE))
-#define CLR_TC_OK2MUNGE(v)   ( v & ~TC_OK2MUNGE)
-
-#define S_TC_VERD          _TC_MAKE32(2)
-#define M_TC_VERD          _TC_MAKEMASK(4,S_TC_VERD)
-#define G_TC_VERD(x)       _TC_GETVALUE(x,S_TC_VERD,M_TC_VERD)
-#define V_TC_VERD(x)       _TC_MAKEVALUE(x,S_TC_VERD)
-#define SET_TC_VERD(v,n)   ((V_TC_VERD(n)) | (v & ~M_TC_VERD))
-#endif
-
 #define S_TC_FROM          _TC_MAKE32(6)
 #define M_TC_FROM          _TC_MAKEMASK(2,S_TC_FROM)
 #define G_TC_FROM(x)       _TC_GETVALUE(x,S_TC_FROM,M_TC_FROM)
@@ -65,20 +49,16 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define SET_TC_NCLS(v)   ( TC_NCLS | (v & ~TC_NCLS))
 #define CLR_TC_NCLS(v)   ( v & ~TC_NCLS)
 
-#ifndef __KERNEL__
-#define S_TC_RTTL          _TC_MAKE32(9)
-#define M_TC_RTTL          _TC_MAKEMASK(3,S_TC_RTTL)
-#define G_TC_RTTL(x)       _TC_GETVALUE(x,S_TC_RTTL,M_TC_RTTL)
-#define V_TC_RTTL(x)       _TC_MAKEVALUE(x,S_TC_RTTL)
-#define SET_TC_RTTL(v,n)   ((V_TC_RTTL(n)) | (v & ~M_TC_RTTL))
-#endif
-
 #define S_TC_AT          _TC_MAKE32(12)
 #define M_TC_AT          _TC_MAKEMASK(2,S_TC_AT)
 #define G_TC_AT(x)       _TC_GETVALUE(x,S_TC_AT,M_TC_AT)
 #define V_TC_AT(x)       _TC_MAKEVALUE(x,S_TC_AT)
 #define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))
 
+#define MAX_REC_LOOP 4
+#define MAX_RED_LOOP 4
+#endif
+
 /* Action attributes */
 enum {
 	TCA_ACT_UNSPEC,
@@ -98,8 +78,6 @@ enum {
 #define TCA_ACT_NOUNBIND	0
 #define TCA_ACT_REPLACE		1
 #define TCA_ACT_NOREPLACE	0
-#define MAX_REC_LOOP 4
-#define MAX_RED_LOOP 4
 
 #define TC_ACT_UNSPEC	(-1)
 #define TC_ACT_OK		0
-- 
cgit v1.2.3


From d52d3997f843ffefaa8d8462790ffcaca6c74192 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 22 May 2015 20:56:06 -0700
Subject: ipv6: Create percpu rt6_info

After the patch
'ipv6: Only create RTF_CACHE routes after encountering pmtu exception',
we need to compensate the performance hit (bouncing dst->__refcnt).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ipv6_route.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index 2be7bd174751..f6598d1c886e 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -34,6 +34,7 @@
 #define RTF_PREF(pref)	((pref) << 27)
 #define RTF_PREF_MASK	0x18000000
 
+#define RTF_PCPU	0x40000000
 #define RTF_LOCAL	0x80000000
 
 
-- 
cgit v1.2.3


From ebddf1a8d78aa3436353fae75c4396e50cb2d6cf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 26 May 2015 18:41:20 +0200
Subject: netfilter: nf_tables: allow to bind table to net_device

This patch adds the internal NFT_AF_NEEDS_DEV flag to indicate that you must
attach this table to a net_device.

This change is required by the follow up patch that introduces the new netdev
table.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5fa1cd04762e..89a671e0f5e7 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -146,12 +146,14 @@ enum nft_table_flags {
  * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
  * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32)
  * @NFTA_TABLE_USE: number of chains in this table (NLA_U32)
+ * @NFTA_TABLE_DEV: net device name (NLA_STRING)
  */
 enum nft_table_attributes {
 	NFTA_TABLE_UNSPEC,
 	NFTA_TABLE_NAME,
 	NFTA_TABLE_FLAGS,
 	NFTA_TABLE_USE,
+	NFTA_TABLE_DEV,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
-- 
cgit v1.2.3


From 8cf6f497de405ca9eb87ffb34d90699962d10125 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.r.fastabend@intel.com>
Date: Tue, 26 May 2015 08:22:49 -0700
Subject: ethtool: Add helper routines to pass vf to rx_flow_spec

The ring_cookie is 64 bits wide which is much larger than can be used
for actual queue index values. So provide some helper routines to
pack a VF index into the cookie. This is useful to steer packets to
a VF ring without having to know the queue layout of the device.

CC: Alex Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/uapi/linux/ethtool.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index ae832b45b44c..0594933cdf55 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -796,6 +796,31 @@ struct ethtool_rx_flow_spec {
 	__u32		location;
 };
 
+/* How rings are layed out when accessing virtual functions or
+ * offloaded queues is device specific. To allow users to do flow
+ * steering and specify these queues the ring cookie is partitioned
+ * into a 32bit queue index with an 8 bit virtual function id.
+ * This also leaves the 3bytes for further specifiers. It is possible
+ * future devices may support more than 256 virtual functions if
+ * devices start supporting PCIe w/ARI. However at the moment I
+ * do not know of any devices that support this so I do not reserve
+ * space for this at this time. If a future patch consumes the next
+ * byte it should be aware of this possiblity.
+ */
+#define ETHTOOL_RX_FLOW_SPEC_RING	0x00000000FFFFFFFFLL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF	0x000000FF00000000LL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32
+static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie)
+{
+	return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie;
+};
+
+static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie)
+{
+	return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >>
+				ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
+};
+
 /**
  * struct ethtool_rxnfc - command to get or set RX flow classification rules
  * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH,
-- 
cgit v1.2.3


From 37e82c2f974b72c9ab49c787ef7b5bb1aec12768 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 27 May 2015 15:30:39 -0700
Subject: bpf: allow BPF programs access skb->skb_iif and skb->dev->ifindex
 fields

classic BPF already exposes skb->dev->ifindex via SKF_AD_IFINDEX extension.
Allow eBPF program to access it as well. Note that classic aborts execution
of the program if 'skb->dev == NULL' (which is inconvenient for program
writers), whereas eBPF returns zero in such case.
Also expose the 'skb_iif' field, since programs triggered by redirected
packet need to known the original interface index.
Summary:
__skb->ifindex         -> skb->dev->ifindex
__skb->ingress_ifindex -> skb->skb_iif

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f0a9af8b4dae..72f3080afa1e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -236,6 +236,8 @@ struct __sk_buff {
 	__u32 vlan_tci;
 	__u32 vlan_proto;
 	__u32 priority;
+	__u32 ingress_ifindex;
+	__u32 ifindex;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From a28c257c9eb0bd76a4adcac97c07e34044ec71fb Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Fri, 29 May 2015 17:28:07 -0400
Subject: net/rds: Declare SO_RDS_TRANSPORT and RDS_TRANS_* constants in
 uapi/linux/rds.h

User space applications that desire to explicitly select the
underlying transport for a PF_RDS socket may do so by using the
SO_RDS_TRANSPORT socket option at the SOL_RDS level before bind().
The integer argument provided to the socket option would be one
of the RDS_TRANS_* values, e.g., RDS_TRANS_TCP. This commit exports
the constant values need by such applications via <linux/rds.h>

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rds.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 91950950aa59..0f9265cb2a96 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -38,6 +38,8 @@
 
 #define RDS_IB_ABI_VERSION		0x301
 
+#define	SOL_RDS		276
+
 /*
  * setsockopt/getsockopt for SOL_RDS
  */
@@ -48,6 +50,14 @@
 #define RDS_RECVERR			5
 #define RDS_CONG_MONITOR		6
 #define RDS_GET_MR_FOR_DEST		7
+#define SO_RDS_TRANSPORT		8
+
+/* supported values for SO_RDS_TRANSPORT */
+#define	RDS_TRANS_IB	0
+#define	RDS_TRANS_IWARP	1
+#define	RDS_TRANS_TCP	2
+#define RDS_TRANS_COUNT	3
+#define	RDS_TRANS_NONE	(~0)
 
 /*
  * Control message types for SOL_RDS.
-- 
cgit v1.2.3


From ccea74457bbdafe33dce8bffcb5cb183aeb5f2bb Mon Sep 17 00:00:00 2001
From: Neil McKee <neil.mckee@inmon.com>
Date: Tue, 26 May 2015 20:59:43 -0700
Subject: openvswitch: include datapath actions with sampled-packet upcall to
 userspace

If new optional attribute OVS_USERSPACE_ATTR_ACTIONS is added to an
OVS_ACTION_ATTR_USERSPACE action, then include the datapath actions
in the upcall.

This Directly associates the sampled packet with the path it takes
through the virtual switch. Path information currently includes mangling,
encapsulation and decapsulation actions for tunneling protocols GRE,
VXLAN, Geneve, MPLS and QinQ, but this extension requires no further
changes to accommodate datapath actions that may be added in the
future.

Adding path information enhances visibility into complex virtual
networks.

Signed-off-by: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index bbd49a0c46c7..1dab77601c21 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -153,6 +153,8 @@ enum ovs_packet_cmd {
  * flow key against the kernel's.
  * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet.  Used
  * for %OVS_PACKET_CMD_EXECUTE.  It has nested %OVS_ACTION_ATTR_* attributes.
+ * Also used in upcall when %OVS_ACTION_ATTR_USERSPACE has optional
+ * %OVS_USERSPACE_ATTR_ACTIONS attribute.
  * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION
  * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an
  * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content
@@ -528,6 +530,7 @@ enum ovs_sample_attr {
  * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA.
  * @OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: If present, u32 output port to get
  * tunnel info.
+ * @OVS_USERSPACE_ATTR_ACTIONS: If present, send actions with upcall.
  */
 enum ovs_userspace_attr {
 	OVS_USERSPACE_ATTR_UNSPEC,
@@ -535,6 +538,7 @@ enum ovs_userspace_attr {
 	OVS_USERSPACE_ATTR_USERDATA,  /* Optional user-specified cookie. */
 	OVS_USERSPACE_ATTR_EGRESS_TUN_PORT,  /* Optional, u32 output port
 					      * to get tunnel info. */
+	OVS_USERSPACE_ATTR_ACTIONS,   /* Optional flag to get actions. */
 	__OVS_USERSPACE_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From 8760ce58353c2099be35ead62a572ee2d1e83b5b Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Mon, 1 Jun 2015 15:51:34 -0400
Subject: geneve: allow user to specify TTL for tunnel frames

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index afccc9393fef..a834f31db915 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -395,6 +395,7 @@ enum {
 	IFLA_GENEVE_UNSPEC,
 	IFLA_GENEVE_ID,
 	IFLA_GENEVE_REMOTE,
+	IFLA_GENEVE_TTL,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From d89511251f6519599b109dc6cda87a6ab314ed8c Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Mon, 1 Jun 2015 15:51:35 -0400
Subject: geneve: allow user to specify TOS info for tunnel frames

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a834f31db915..1737b7a8272b 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -396,6 +396,7 @@ enum {
 	IFLA_GENEVE_ID,
 	IFLA_GENEVE_REMOTE,
 	IFLA_GENEVE_TTL,
+	IFLA_GENEVE_TOS,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 3896d655f4d491c67d669a15f275a39f713410f8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 2 Jun 2015 16:03:14 -0700
Subject: bpf: introduce bpf_clone_redirect() helper

Allow eBPF programs attached to classifier/actions to call
bpf_clone_redirect(skb, ifindex, flags) helper which will
mirror or redirect the packet by dynamic ifindex selection
from within the program to a target device either at ingress
or at egress. Can be used for various scenarios, for example,
to load balance skbs into veths, split parts of the traffic
to local taps, etc.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 72f3080afa1e..42aa19abab86 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -220,6 +220,16 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_tail_call,
+
+	/**
+	 * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev
+	 * @skb: pointer to skb
+	 * @ifindex: ifindex of the net device
+	 * @flags: bit 0 - if set, redirect to ingress instead of egress
+	 *         other bits - reserved
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_clone_redirect,
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 730fc4371333636a00fed32c587fc1e85c5367e2 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:37 -0700
Subject: mpls: Add definition for IPPROTO_MPLS

Add uapi define for MPLS over IP.

Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 589ced069e8a..641338bef651 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -69,6 +69,8 @@ enum {
 #define IPPROTO_SCTP		IPPROTO_SCTP
   IPPROTO_UDPLITE = 136,	/* UDP-Lite (RFC 3828)			*/
 #define IPPROTO_UDPLITE		IPPROTO_UDPLITE
+  IPPROTO_MPLS = 137,		/* MPLS in IP (RFC 4023)		*/
+#define IPPROTO_MPLS		IPPROTO_MPLS
   IPPROTO_RAW = 255,		/* Raw IP packets			*/
 #define IPPROTO_RAW		IPPROTO_RAW
   IPPROTO_MAX
-- 
cgit v1.2.3


From 90c337da1524863838658078ec34241f45d8394d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Jun 2015 21:17:57 -0700
Subject: inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations

When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).

As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())

With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.

This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.

The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.

This new feature is available for both IPv4 and IPv6 (Thanks Neal)

Tested:

Wrote a test program and checked its behavior on IPv4 and IPv6.

strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0

IPv6 test :

socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0

I was able to bind()/connect() a million concurrent IPv4 sockets,
instead of ~32000 before patch.

lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets

lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66

Check that a given source port is indeed used by many different
connections :

lpaa23:~# ss -t src :40000 | head -10
State      Recv-Q Send-Q   Local Address:Port          Peer Address:Port
ESTAB      0      0           127.0.0.2:40000         127.0.202.33:44983
ESTAB      0      0           127.0.0.2:40000         127.2.27.240:44983
ESTAB      0      0           127.0.0.2:40000           127.2.98.5:44983
ESTAB      0      0           127.0.0.2:40000        127.0.124.196:44983
ESTAB      0      0           127.0.0.2:40000         127.2.139.38:44983
ESTAB      0      0           127.0.0.2:40000          127.1.59.80:44983
ESTAB      0      0           127.0.0.2:40000          127.3.6.228:44983
ESTAB      0      0           127.0.0.2:40000          127.0.38.53:44983
ESTAB      0      0           127.0.0.2:40000         127.1.197.10:44983

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/in.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 641338bef651..83d6236a2f08 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -112,6 +112,7 @@ struct in_addr {
 #define IP_MINTTL       21
 #define IP_NODEFRAG     22
 #define IP_CHECKSUM	23
+#define IP_BIND_ADDRESS_NO_PORT	24
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
-- 
cgit v1.2.3


From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 4 Jun 2015 10:11:54 -0700
Subject: bpf: allow programs to write to certain skb fields

allow programs read/write skb->mark, tc_index fields and
((struct qdisc_skb_cb *)cb)->data.

mark and tc_index are generically useful in TC.
cb[0]-cb[4] are primarily used to pass arguments from one
program to another called via bpf_tail_call() which can
be seen in sockex3_kern.c example.

All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs.
mark, tc_index are writeable from tc_cls_act only.
cb[0]-cb[4] are writeable by both sockets and tc_cls_act.

Add verifier tests and improve sample code.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42aa19abab86..602f05b7a275 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -248,6 +248,8 @@ struct __sk_buff {
 	__u32 priority;
 	__u32 ingress_ifindex;
 	__u32 ifindex;
+	__u32 tc_index;
+	__u32 cb[5];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 9e58095f9660f88d6a2febe87d5073a6b2e9c399 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Tue, 14 Oct 2014 02:19:46 +0200
Subject: NFC: netlink: Implement vendor command support

Vendor commands are passed from userspace through the
NFC_CMD_VENDOR netlink command, allowing driver and hardware
specific operations implementations like for example RF tuning
or production line calibration.

Drivers will associate a set of vendor commands to a vendor
id, which could typically be an OUI. The netlink kernel
implementation will try to match the received vendor id
and sub command attributes with the registered ones. When
such match is found, the driver defined sub command routine
is called.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index c1e2e63cf9b5..dd3f75389076 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -86,6 +86,8 @@
  *	for this event is the application ID (AID).
  * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller.
  * @NFC_CMD_SE_IO: Send/Receive APDUs to/from the selected secure element.
+ * @NFC_CMD_VENDOR: Vendor specific command, to be implemented directly
+ *	from the driver in order to support hardware specific operations.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -117,6 +119,7 @@ enum nfc_commands {
 	NFC_CMD_GET_SE,
 	NFC_CMD_SE_IO,
 	NFC_CMD_ACTIVATE_TARGET,
+	NFC_CMD_VENDOR,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
@@ -153,6 +156,10 @@ enum nfc_commands {
  * @NFC_ATTR_APDU: Secure element APDU
  * @NFC_ATTR_TARGET_ISO15693_DSFID: ISO 15693 Data Storage Format Identifier
  * @NFC_ATTR_TARGET_ISO15693_UID: ISO 15693 Unique Identifier
+ * @NFC_ATTR_VENDOR_ID: NFC manufacturer unique ID, typically an OUI
+ * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command
+ * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed
+ *	to a vendor specific command implementation
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -184,6 +191,9 @@ enum nfc_attrs {
 	NFC_ATTR_TARGET_ISO15693_DSFID,
 	NFC_ATTR_TARGET_ISO15693_UID,
 	NFC_ATTR_SE_PARAMS,
+	NFC_ATTR_VENDOR_ID,
+	NFC_ATTR_VENDOR_SUBCMD,
+	NFC_ATTR_VENDOR_DATA,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
-- 
cgit v1.2.3


From dd895d7f21b244e7fd4c7477697e274de7e44ecb Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 9 Jun 2015 08:05:10 +0200
Subject: can: cangw: introduce optional uid to reference created routing jobs

Similar to referencing iptables rules by their line number this UID allows to
reference created routing jobs, e.g. to alter configured data modifications.

The UID is an optional non-zero value which can be provided at routing job
creation time. When the UID is set the UID replaces the data modification
configuration as job identification attribute e.g. at job removal time.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/gw.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h
index 3e6184cf2f6d..5079b9d57e31 100644
--- a/include/uapi/linux/can/gw.h
+++ b/include/uapi/linux/can/gw.h
@@ -78,6 +78,7 @@ enum {
 	CGW_FILTER,	/* specify struct can_filter on source CAN device */
 	CGW_DELETED,	/* number of deleted CAN frames (see max_hops param) */
 	CGW_LIM_HOPS,	/* limit the number of hops of this specific rule */
+	CGW_MOD_UID,	/* user defined identifier for modification updates */
 	__CGW_MAX
 };
 
@@ -162,6 +163,10 @@ enum {
  * load time of the can-gw module). This value is used to reduce the number of
  * possible hops for this gateway rule to a value smaller then max_hops.
  *
+ * CGW_MOD_UID (length 4 bytes):
+ * Optional non-zero user defined routing job identifier to alter existing
+ * modification settings at runtime.
+ *
  * CGW_CS_XOR (length 4 bytes):
  * Set a simple XOR checksum starting with an initial value into
  * data[result-idx] using data[start-idx] .. data[end-idx]
-- 
cgit v1.2.3


From a4244b0cf58d56c171874e85228ba5deffeb017a Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 11 Jun 2015 10:28:16 +0300
Subject: net/ethtool: Add current supported tunable options

Add strings array of the current supported tunable options.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 0594933cdf55..cd67aec187d9 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -215,6 +215,11 @@ enum tunable_id {
 	ETHTOOL_ID_UNSPEC,
 	ETHTOOL_RX_COPYBREAK,
 	ETHTOOL_TX_COPYBREAK,
+	/*
+	 * Add your fresh new tubale attribute above and remember to update
+	 * tunable_strings[] in net/core/ethtool.c
+	 */
+	__ETHTOOL_TUNABLE_COUNT,
 };
 
 enum tunable_type_id {
@@ -545,6 +550,7 @@ enum ethtool_stringset {
 	ETH_SS_NTUPLE_FILTERS,
 	ETH_SS_FEATURES,
 	ETH_SS_RSS_HASH_FUNCS,
+	ETH_SS_TUNABLES,
 };
 
 /**
-- 
cgit v1.2.3


From 9961127d4bce6325e9a0b0fb105e0c85a6c62cb7 Mon Sep 17 00:00:00 2001
From: Vincent Cuissard <cuissard@marvell.com>
Date: Thu, 11 Jun 2015 11:25:47 +0200
Subject: NFC: nci: add generic uart support

Some NFC controller supports UART as host interface.
As with SPI, a lot of code can be shared between vendor
drivers. This patch add the generic support of UART and
provides some extension API for vendor specific needs.

This code is strongly inspired by the Bluetooth HCI ldisc
implementation. NCI UART vendor drivers will have to register
themselves to this layer via nci_uart_register.

Underlying tty will have to be configured from user land
thanks to an ioctl.

Signed-off-by: Vincent Cuissard <cuissard@marvell.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/tty.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h
index dac199a2dba5..01c4410352ff 100644
--- a/include/uapi/linux/tty.h
+++ b/include/uapi/linux/tty.h
@@ -34,5 +34,6 @@
 #define N_TI_WL		22	/* for TI's WL BT, FM, GPS combo chips */
 #define N_TRACESINK	23	/* Trace data routing for MIPI P1149.7 */
 #define N_TRACEROUTER	24	/* Trace data routing for MIPI P1149.7 */
+#define N_NCI		25	/* NFC NCI UART */
 
 #endif /* _UAPI_LINUX_TTY_H */
-- 
cgit v1.2.3


From ca0f6a5cd99e0c6ba4bb78dc402817f636370f26 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 13 Jun 2015 19:45:33 +0200
Subject: netfilter: ipset: Fix coding styles reported by checkpatch.pl

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/uapi/linux/netfilter/ipset/ip_set.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 5ab4e60894cf..63b2e34f1b60 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -15,12 +15,12 @@
 /* The protocol version */
 #define IPSET_PROTOCOL		6
 
-/* The maximum permissible comment length we will accept over netlink */
-#define IPSET_MAX_COMMENT_SIZE	255
-
 /* The max length of strings including NUL: set and type identifiers */
 #define IPSET_MAXNAMELEN	32
 
+/* The maximum permissible comment length we will accept over netlink */
+#define IPSET_MAX_COMMENT_SIZE	255
+
 /* Message types and commands */
 enum ipset_cmd {
 	IPSET_CMD_NONE,
-- 
cgit v1.2.3


From 2cbce139fc57bc2625f88add055d0b94f00c3352 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 12 Jun 2015 13:55:41 +0200
Subject: netfilter: nf_tables: attach net_device to basechain

The device is part of the hook configuration, so instead of a global
configuration per table, set it to each of the basechain that we create.

This patch reworks ebddf1a8d78a ("netfilter: nf_tables: allow to bind table to
net_device").

Note that this adds a dev_name field in the nft_base_chain structure which is
required the netdev notification subscription that follows up in a patch to
handle gone net_devices.

Suggested-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 89a671e0f5e7..a99e6a997140 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -122,11 +122,13 @@ enum nft_list_attributes {
  *
  * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32)
  * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
+ * @NFTA_HOOK_DEV: netdevice name (NLA_STRING)
  */
 enum nft_hook_attributes {
 	NFTA_HOOK_UNSPEC,
 	NFTA_HOOK_HOOKNUM,
 	NFTA_HOOK_PRIORITY,
+	NFTA_HOOK_DEV,
 	__NFTA_HOOK_MAX
 };
 #define NFTA_HOOK_MAX		(__NFTA_HOOK_MAX - 1)
@@ -146,14 +148,12 @@ enum nft_table_flags {
  * @NFTA_TABLE_NAME: name of the table (NLA_STRING)
  * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32)
  * @NFTA_TABLE_USE: number of chains in this table (NLA_U32)
- * @NFTA_TABLE_DEV: net device name (NLA_STRING)
  */
 enum nft_table_attributes {
 	NFTA_TABLE_UNSPEC,
 	NFTA_TABLE_NAME,
 	NFTA_TABLE_FLAGS,
 	NFTA_TABLE_USE,
-	NFTA_TABLE_DEV,
 	__NFTA_TABLE_MAX
 };
 #define NFTA_TABLE_MAX		(__NFTA_TABLE_MAX - 1)
-- 
cgit v1.2.3


From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 12 Jun 2015 19:39:12 -0700
Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors

eBPF programs attached to kprobes need to filter based on
current->pid, uid and other fields, so introduce helper functions:

u64 bpf_get_current_pid_tgid(void)
Return: current->tgid << 32 | current->pid

u64 bpf_get_current_uid_gid(void)
Return: current_gid << 32 | current_uid

bpf_get_current_comm(char *buf, int size_of_buf)
stores current->comm into buf

They can be used from the programs attached to TC as well to classify packets
based on current task fields.

Update tracex2 example to print histogram of write syscalls for each process
instead of aggregated for all.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 602f05b7a275..29ef6f99e43d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -230,6 +230,25 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_clone_redirect,
+
+	/**
+	 * u64 bpf_get_current_pid_tgid(void)
+	 * Return: current->tgid << 32 | current->pid
+	 */
+	BPF_FUNC_get_current_pid_tgid,
+
+	/**
+	 * u64 bpf_get_current_uid_gid(void)
+	 * Return: current_gid << 32 | current_uid
+	 */
+	BPF_FUNC_get_current_uid_gid,
+
+	/**
+	 * bpf_get_current_comm(char *buf, int size_of_buf)
+	 * stores current->comm into buf
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_get_current_comm,
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From 254cb6dbfd8894743fbf814ec856ccd0874af691 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Sun, 14 Jun 2015 16:36:34 +0300
Subject: bonding: export slave's actor_oper_port_state via sysfs and netlink

Export the actor_oper_port_state of each port via sysfs and netlink.
In 802.3ad mode it is valuable for the user to be able to check the
actor_oper state, it is already exported via bond's proc entry.

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1737b7a8272b..1b3e357223f2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -456,6 +456,7 @@ enum {
 	IFLA_BOND_SLAVE_PERM_HWADDR,
 	IFLA_BOND_SLAVE_QUEUE_ID,
 	IFLA_BOND_SLAVE_AD_AGGREGATOR_ID,
+	IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
 	__IFLA_BOND_SLAVE_MAX,
 };
 
-- 
cgit v1.2.3


From 46ea297ed67cdeeb0142244873458b911037d0ba Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Sun, 14 Jun 2015 16:36:35 +0300
Subject: bonding: export slave's partner_oper_port_state via sysfs and netlink

Export the partner_oper_port_state of each port via sysfs and netlink.
In 802.3ad mode it is valuable for the user to be able to check the
partner_oper state, it is already exported via bond's proc entry.

Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1b3e357223f2..510efb360580 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -457,6 +457,7 @@ enum {
 	IFLA_BOND_SLAVE_QUEUE_ID,
 	IFLA_BOND_SLAVE_AD_AGGREGATOR_ID,
 	IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE,
+	IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE,
 	__IFLA_BOND_SLAVE_MAX,
 };
 
-- 
cgit v1.2.3


From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:59:07 +0300
Subject: net/core: Add reading VF statistics through the PF netdevice

Add ndo_get_vf_stats where the PF retrieves and fills the VFs traffic
statistics. We encode the VF stats in a nested manner to allow for
future extensions.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 510efb360580..2c7e8e3d3981 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -484,6 +484,7 @@ enum {
 	IFLA_VF_RSS_QUERY_EN,	/* RSS Redirection Table and Hash Key query
 				 * on/off switch
 				 */
+	IFLA_VF_STATS,		/* network device statistics */
 	__IFLA_VF_MAX,
 };
 
@@ -533,6 +534,18 @@ struct ifla_vf_rss_query_en {
 	__u32 setting;
 };
 
+enum {
+	IFLA_VF_STATS_RX_PACKETS,
+	IFLA_VF_STATS_TX_PACKETS,
+	IFLA_VF_STATS_RX_BYTES,
+	IFLA_VF_STATS_TX_BYTES,
+	IFLA_VF_STATS_BROADCAST,
+	IFLA_VF_STATS_MULTICAST,
+	__IFLA_VF_STATS_MAX,
+};
+
+#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1)
+
 /* VF ports management section
  *
  *	Nested layout of set/get msg is:
-- 
cgit v1.2.3


From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 15 Jun 2015 11:26:18 -0400
Subject: sock_diag: define destruction multicast groups

These groups will contain socket-destruction events for
AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP.

Near the end of socket destruction, a check for listeners is
performed.  In the presence of a listener, rather than completely
cleanup the socket, a unit of work will be added to a private
work queue which will first broadcast information about the socket
and then finish the cleanup operation.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sock_diag.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index b00e29efb161..49230d36f9ce 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -23,4 +23,14 @@ enum {
 	SK_MEMINFO_VARS,
 };
 
+enum sknetlink_groups {
+	SKNLGRP_NONE,
+	SKNLGRP_INET_TCP_DESTROY,
+	SKNLGRP_INET_UDP_DESTROY,
+	SKNLGRP_INET6_TCP_DESTROY,
+	SKNLGRP_INET6_UDP_DESTROY,
+	__SKNLGRP_MAX,
+};
+#define SKNLGRP_MAX	(__SKNLGRP_MAX - 1)
+
 #endif /* _UAPI__SOCK_DIAG_H__ */
-- 
cgit v1.2.3


From 35ac838a9b96470f999db04320f53a2033642bfb Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 15 Jun 2015 11:26:20 -0400
Subject: sock_diag: implement a get_info handler for inet

This get_info handler will simply dispatch to the appropriate
existing inet protocol handler.

This patch also includes a new netlink attribute
(INET_DIAG_PROTOCOL).  This attribute is currently only used
for multicast messages.  Without this attribute, there is no
way of knowing the IP protocol used by the socket information
being broadcast.  This attribute is not necessary in the 'dump'
variant of this protocol (though it could easily be added)
because dump requests are issued for specific family/protocol
pairs.

Tested: ss -E (note, the -E option has not yet been merged into
the upstream version of ss).

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index c7093c75bdd6..b629fc53b109 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -111,9 +111,10 @@ enum {
 	INET_DIAG_SKMEMINFO,
 	INET_DIAG_SHUTDOWN,
 	INET_DIAG_DCTCPINFO,
+	INET_DIAG_PROTOCOL,  /* response attribute only */
 };
 
-#define INET_DIAG_MAX INET_DIAG_DCTCPINFO
+#define INET_DIAG_MAX INET_DIAG_PROTOCOL
 
 /* INET_DIAG_MEM */
 
-- 
cgit v1.2.3


From ef493bd930ae482c608c5999b40d79dda3f2a674 Mon Sep 17 00:00:00 2001
From: Roman Kubiak <r.kubiak@samsung.com>
Date: Fri, 12 Jun 2015 12:32:57 +0200
Subject: netfilter: nfnetlink_queue: add security context information

This patch adds an additional attribute when sending
packet information via netlink in netfilter_queue module.
It will send additional security context data, so that
userspace applications can verify this context against
their own security databases.

Signed-off-by: Roman Kubiak <r.kubiak@samsung.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_queue.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index 8dd819e2b5fe..b67a853638ff 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -49,6 +49,7 @@ enum nfqnl_attr_type {
 	NFQA_EXP,			/* nf_conntrack_netlink.h */
 	NFQA_UID,			/* __u32 sk uid */
 	NFQA_GID,			/* __u32 sk gid */
+	NFQA_SECCTX,			/* security context string */
 
 	__NFQA_MAX
 };
@@ -102,7 +103,8 @@ enum nfqnl_attr_config {
 #define NFQA_CFG_F_CONNTRACK			(1 << 1)
 #define NFQA_CFG_F_GSO				(1 << 2)
 #define NFQA_CFG_F_UID_GID			(1 << 3)
-#define NFQA_CFG_F_MAX				(1 << 4)
+#define NFQA_CFG_F_SECCTX			(1 << 4)
+#define NFQA_CFG_F_MAX				(1 << 5)
 
 /* flags for NFQA_SKB_INFO */
 /* packet appears to have wrong checksums, but they are ok */
-- 
cgit v1.2.3


From 01555e74bde51444c6898ef1800fb2bc697d479e Mon Sep 17 00:00:00 2001
From: Harout Hedeshian <harouth@codeaurora.org>
Date: Mon, 15 Jun 2015 18:40:43 -0600
Subject: netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag

xt_socket is useful for matching sockets with IP_TRANSPARENT and
taking some action on the matching packets. However, it lacks the
ability to match only a small subset of transparent sockets.

Suppose there are 2 applications, each with its own set of transparent
sockets. The first application wants all matching packets dropped,
while the second application wants them forwarded somewhere else.

Add the ability to retore the skb->mark from the sk_mark. The mark
is only restored if a matching socket is found and the transparent /
nowildcard conditions are satisfied.

Now the 2 hypothetical applications can differentiate their sockets
based on a mark value set with SO_MARK.

iptables -t mangle -I PREROUTING -m socket --transparent \
                                           --restore-skmark -j action
iptables -t mangle -A action -m mark --mark 10 -j action2
iptables -t mangle -A action -m mark --mark 11 -j action3

Signed-off-by: Harout Hedeshian <harouth@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_socket.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/xt_socket.h b/include/uapi/linux/netfilter/xt_socket.h
index 6315e2ac3474..87644f832494 100644
--- a/include/uapi/linux/netfilter/xt_socket.h
+++ b/include/uapi/linux/netfilter/xt_socket.h
@@ -6,6 +6,7 @@
 enum {
 	XT_SOCKET_TRANSPARENT = 1 << 0,
 	XT_SOCKET_NOWILDCARD = 1 << 1,
+	XT_SOCKET_RESTORESKMARK = 1 << 2,
 };
 
 struct xt_socket_mtinfo1 {
@@ -18,4 +19,11 @@ struct xt_socket_mtinfo2 {
 };
 #define XT_SOCKET_FLAGS_V2 (XT_SOCKET_TRANSPARENT | XT_SOCKET_NOWILDCARD)
 
+struct xt_socket_mtinfo3 {
+	__u8 flags;
+};
+#define XT_SOCKET_FLAGS_V3 (XT_SOCKET_TRANSPARENT \
+			   | XT_SOCKET_NOWILDCARD \
+			   | XT_SOCKET_RESTORESKMARK)
+
 #endif /* _XT_SOCKET_H */
-- 
cgit v1.2.3


From a263653ed798216c0069922d7b5237ca49436007 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 17 Jun 2015 10:28:27 -0500
Subject: netfilter: don't pull include/linux/netfilter.h from netns headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pulls the full hook netfilter definitions from all those that include
net_namespace.h.

Instead let's just include the bare minimum required in the new
linux/netfilter_defs.h file, and use it from the netfilter netns header files.

I also needed to include in.h and in6.h from linux/netfilter.h otherwise we hit
this compilation error:

In file included from include/linux/netfilter_defs.h:4:0,
                 from include/net/netns/netfilter.h:4,
                 from include/net/net_namespace.h:22,
                 from include/linux/netdevice.h:43,
                 from net/netfilter/nfnetlink_queue_core.c:23:
include/uapi/linux/netfilter.h:76:17: error: field ‘in’ has incomplete type struct in_addr in;

And also explicit include linux/netfilter.h in several spots.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/uapi/linux/netfilter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index 177027cce6b3..d93f949d1d9a 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -4,7 +4,8 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/sysctl.h>
-
+#include <linux/in.h>
+#include <linux/in6.h>
 
 /* Responses from hook functions. */
 #define NF_DROP 0
-- 
cgit v1.2.3


From b42be38b2778eda2237fc759e55e3b698b05b315 Mon Sep 17 00:00:00 2001
From: David Herrmann <dh.herrmann@gmail.com>
Date: Wed, 17 Jun 2015 17:14:33 +0200
Subject: netlink: add API to retrieve all group memberships

This patch adds getsockopt(SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS) to
retrieve all groups a socket is a member of. Currently, we have to use
getsockname() and look at the nl.nl_groups bitmask. However, this mask is
limited to 32 groups. Hence, similar to NETLINK_ADD_MEMBERSHIP and
NETLINK_DROP_MEMBERSHIP, this adds a separate sockopt to manager higher
groups IDs than 32.

This new NETLINK_LIST_MEMBERSHIPS option takes a pointer to __u32 and the
size of the array. The array is filled with the full membership-set of the
socket, and the required array size is returned in optlen. Hence,
user-space can retry with a properly sized array in case it was too small.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netlink.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 3e34b7d702f8..cf6a65cccbdf 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -101,14 +101,15 @@ struct nlmsgerr {
 	struct nlmsghdr msg;
 };
 
-#define NETLINK_ADD_MEMBERSHIP	1
-#define NETLINK_DROP_MEMBERSHIP	2
-#define NETLINK_PKTINFO		3
-#define NETLINK_BROADCAST_ERROR	4
-#define NETLINK_NO_ENOBUFS	5
-#define NETLINK_RX_RING		6
-#define NETLINK_TX_RING		7
-#define NETLINK_LISTEN_ALL_NSID	8
+#define NETLINK_ADD_MEMBERSHIP		1
+#define NETLINK_DROP_MEMBERSHIP		2
+#define NETLINK_PKTINFO			3
+#define NETLINK_BROADCAST_ERROR		4
+#define NETLINK_NO_ENOBUFS		5
+#define NETLINK_RX_RING			6
+#define NETLINK_TX_RING			7
+#define NETLINK_LISTEN_ALL_NSID		8
+#define NETLINK_LIST_MEMBERSHIPS	9
 
 struct nl_pktinfo {
 	__u32	group;
-- 
cgit v1.2.3


From 42bcce87d763b4d22dc6d3a0c0b60c6b49820de8 Mon Sep 17 00:00:00 2001
From: Anish Bhatt <anish@chelsio.com>
Date: Mon, 22 Jun 2015 17:44:35 -0700
Subject: dcb : Fix incorrect documentation for struct dcb_app

While IEEE and CEE use the same structure to store apps, the selector
and priority fields for both are different. Only the priority field is
explained, add documentation explaining how the selector field differs
for both.

cgdcbxd code shows an example of how selector fields differ.

Signed-off-by: Anish Bhatt <anish@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/dcbnl.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 6497d7933d5b..3ea470f35e40 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -207,8 +207,7 @@ struct cee_pfc {
 #define IEEE_8021QAZ_APP_SEL_ANY	4
 
 /* This structure contains the IEEE 802.1Qaz APP managed object. This
- * object is also used for the CEE std as well. There is no difference
- * between the objects.
+ * object is also used for the CEE std as well.
  *
  * @selector: protocol identifier type
  * @protocol: protocol of type indicated
@@ -216,13 +215,18 @@ struct cee_pfc {
  *            8-bit 802.1p user priority bitmap for CEE
  *
  * ----
- *  Selector field values
+ *  Selector field values for IEEE 802.1Qaz
  *	0	Reserved
  *	1	Ethertype
  *	2	Well known port number over TCP or SCTP
  *	3	Well known port number over UDP or DCCP
  *	4	Well known port number over TCP, SCTP, UDP, or DCCP
  *	5-7	Reserved
+ *
+ *  Selector field values for CEE
+ *	0	Ethertype
+ *	1	Well known port number over TCP or UDP
+ *	2-3	Reserved
  */
 struct dcb_app {
 	__u8	selector;
-- 
cgit v1.2.3


From 8a3d03166f19329b46c6f9e900f93a89f446077b Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Tue, 23 Jun 2015 13:45:36 -0400
Subject: net: track link-status of ipv4 nexthops

Add a fib flag called RTNH_F_LINKDOWN to any ipv4 nexthops that are
reachable via an interface where carrier is off.  No action is taken,
but additional flags are passed to userspace to indicate carrier status.

This also includes a cleanup to fib_disable_ip to more clearly indicate
what event made the function call to replace the more cryptic force
option previously used.

v2: Split out kernel functionality into 2 patches, this patch simply
sets and clears new nexthop flag RTNH_F_LINKDOWN.

v3: Cleanups suggested by Alex as well as a bug noticed in
fib_sync_down_dev and fib_sync_up when multipath was not enabled.

v5: Whitespace and variable declaration fixups suggested by Dave.

v6: Style fixups noticed by Dave; ran checkpatch to be sure I got them
all.

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 17fb02f488da..fdd8f07f1d34 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -338,6 +338,9 @@ struct rtnexthop {
 #define RTNH_F_PERVASIVE	2	/* Do recursive gateway lookup	*/
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
 #define RTNH_F_OFFLOAD		8	/* offloaded route */
+#define RTNH_F_LINKDOWN		16	/* carrier-down on nexthop */
+
+#define RTNH_COMPARE_MASK	(RTNH_F_DEAD | RTNH_F_LINKDOWN)
 
 /* Macros to handle hexthops */
 
-- 
cgit v1.2.3


From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Tue, 23 Jun 2015 13:45:37 -0400
Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down

This feature is only enabled with the new per-interface or ipv4 global
sysctls called 'ignore_routes_with_linkdown'.

net.ipv4.conf.all.ignore_routes_with_linkdown = 0
net.ipv4.conf.default.ignore_routes_with_linkdown = 0
net.ipv4.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, will report to userspace that a route is
dead and will no longer resolve to this nexthop when performing a fib
lookup.  This will signal to userspace that the route will not be
selected.  The signalling of a RTNH_F_DEAD is only passed to userspace
if the sysctl is enabled and link is down.  This was done as without it
the netlink listeners would have no idea whether or not a nexthop would
be selected.   The kernel only sets RTNH_F_DEAD internally if the
interface has IFF_UP cleared.

With the new sysctl set, the following behavior can be observed
(interface p8p1 is link-down):

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 dead linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
90.0.0.1 via 70.0.0.2 dev p7p1  src 70.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 via 10.0.5.2 dev p9p1  src 10.0.5.15
    cache

While the route does remain in the table (so it can be modified if
needed rather than being wiped away as it would be if IFF_UP was
cleared), the proper next-hop is chosen automatically when the link is
down.  Now interface p8p1 is linked-up:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
90.0.0.1 via 80.0.0.2 dev p8p1  src 80.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 dev p8p1  src 80.0.0.1
    cache

and the output changes to what one would expect.

If the sysctl is not set, the following output would be expected when
p8p1 is down:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2

Since the dead flag does not appear, there should be no expectation that
the kernel would skip using this route due to link being down.

v2: Split kernel changes into 2 patches, this actually makes a
behavioral change if the sysctl is set.  Also took suggestion from Alex
to simplify code by only checking sysctl during fib lookup and
suggestion from Scott to add a per-interface sysctl.

v3: Code clean-ups to make it more readable and efficient as well as a
reverse path check fix.

v4: Drop binary sysctl

v5: Whitespace fixups from Dave

v6: Style changes from Dave and checkpatch suggestions

v7: One more checkpatch fixup

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ip.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 411959405ab6..08f894d2ddbd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -164,6 +164,7 @@ enum
 	IPV4_DEVCONF_ROUTE_LOCALNET,
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	__IPV4_DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From 204621551b2a0060a013b92f7add4d5c452fa7cb Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 24 Jun 2015 11:02:51 +0200
Subject: net: inet_diag: export IPV6_V6ONLY sockopt

For AF_INET6 sockets, the value of struct ipv6_pinfo.ipv6only is
exported to userspace. It indicates whether a socket bound to in6addr_any
listens on IPv4 as well as IPv6. Since the socket is natively IPv6, it is not
listed by e.g. 'ss -l -4'.

This patch is accompanied by an appropriate one for iproute2 to enable
the additional information in 'ss -e'.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/inet_diag.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index b629fc53b109..68a1f71fde9f 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -112,9 +112,10 @@ enum {
 	INET_DIAG_SHUTDOWN,
 	INET_DIAG_DCTCPINFO,
 	INET_DIAG_PROTOCOL,  /* response attribute only */
+	INET_DIAG_SKV6ONLY,
 };
 
-#define INET_DIAG_MAX INET_DIAG_PROTOCOL
+#define INET_DIAG_MAX INET_DIAG_SKV6ONLY
 
 /* INET_DIAG_MEM */
 
-- 
cgit v1.2.3